
Is the InternVL3.0 inference code for Transformers not usable with InternVL3.5?

Open Lee-xeo opened this issue 4 months ago • 10 comments

https://huggingface.co/docs/transformers/v4.55.4/en/model_doc/internvl#video-input If I swap the model in this example for an InternVL3.5-series checkpoint, the sample video input fails with: shape '[8, 27, 13, 2048]' is invalid for input of size 5971968. How can this be resolved? Is this a bug?

Lee-xeo avatar Aug 30 '25 05:08 Lee-xeo

Please make sure you are using a checkpoint with the -HF suffix, e.g. OpenGVLab/InternVL3_5-8B-HF.
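
For example, loading the -HF checkpoint with the Transformers-native classes looks roughly like this (a minimal sketch following the linked documentation; the dtype and device settings are just examples):

import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

model_id = "OpenGVLab/InternVL3_5-8B-HF"  # any InternVL3_5-*-HF checkpoint follows the same pattern
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)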

Weiyun1025 avatar Aug 31 '25 04:08 Weiyun1025

Hi @Lee-xeo, have you resolved the issue? Even using the -HF suffix doesn't help.

nimeshagrawal avatar Sep 07 '25 12:09 nimeshagrawal

@Weiyun1025 Can you provide the solution?

nimeshagrawal avatar Sep 07 '25 14:09 nimeshagrawal

@nimeshagrawal Hello, I implemented the inference using the official (more involved) preprocessing code. It doesn't look elegant, but it works. (Note: you need to use the version without the -HF suffix.)

import argparse
import os
import json
import math
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # enumerate the candidate tile grids (i x j) allowed by min_num/max_num
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    if bound:
        start, end = bound[0], bound[1]
    else:
        start, end = -100000, 100000
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    seg_size = float(end_idx - start_idx) / num_segments
    frame_indices = np.array([
        int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
        for idx in range(num_segments)
    ])
    return frame_indices

def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())

    pixel_values_list, num_patches_list = [], []
    transform = build_transform(input_size=input_size)
    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
    for frame_index in frame_indices:
        img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(tile) for tile in img]
        pixel_values = torch.stack(pixel_values)
        num_patches_list.append(pixel_values.shape[0])
        pixel_values_list.append(pixel_values)
    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list

class TestModel:
    def __init__(self,
                 model,
                 input_dir: str = './annotation/normal',
                 output_dir: str = './response',
                 thinking: bool = False
                 ):
        
        self.input_dir = input_dir
        self.video_meta_info_path = './annotation/video_meta_info.json'
        self.prompt_input_path = os.path.join(input_dir, 'prompt.json')
        
        self.model_name = model
        
        self.output_dir = output_dir
        self.thinking = thinking
        if self.thinking:
            self.response_output_path = os.path.join(output_dir, f'{self.model_name}_thinking_response.json')
        else:
            self.response_output_path = os.path.join(output_dir, f'{self.model_name}_nothinking_response.json')

        # Create the output directory if it does not exist
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        
        self.meta_prompt_file = "meta_prompt/test_vlm_meta_prompt.txt"
        
        # Load the InternVL model (non-HF checkpoint)
        model_path = f"./models/InternVL/{model}"
        self.model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            load_in_8bit=False,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            device_map="auto"
        ).eval()
        
        if self.thinking:
            R1_SYSTEM_PROMPT = """
            You are an AI assistant that rigorously follows this response protocol:

            1. First, conduct a detailed analysis of the question. Consider different angles, potential solutions, and reason through the problem step-by-step. Enclose this entire thinking process within <think> and </think> tags.

            2. After the thinking section, provide a clear, concise, and direct answer to the user's question. Separate the answer from the think section with a newline.

            Ensure that the thinking process is thorough but remains focused on the query. The final answer should be standalone and not reference the thinking section.
            """.strip()

            self.model.system_message = R1_SYSTEM_PROMPT
        
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path, 
            trust_remote_code=True, 
            use_fast=False
        )
        
        print(f"成功加载模型: {model}")
        print(f"检测到 {torch.cuda.device_count()} 个GPU")
        
        # Generation configuration
        if self.thinking:
            self.temperature = 0.6
            self.max_new_tokens = 8192
        else:
            self.temperature = 0.1
            self.max_new_tokens = 2048
        self.generation_config = dict(
            max_new_tokens=self.max_new_tokens, 
            do_sample=True,
            temperature=self.temperature,
            top_p=0.001,
            repetition_penalty=1.05
        )
        
        # Video processing configuration
        self.input_size = 448  # resize each frame to 448x448
        self.fps = 1.0  # sample frames at 1 fps
        self.max_num = 1  # maximum number of tiles per frame
        
    def process_video_with_internvl(self, video_path: str, meta_prompt: str, prompt: str):
        """
        Run video inference with InternVL.
        """
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"视频文件未找到: {video_path}")
        
        # Open the video and read its basic properties
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        video_fps = float(vr.get_avg_fps())
        total_frames = len(vr) - 1
        
        # Compute the number of sampled frames from self.fps
        duration = total_frames / video_fps
        num_segments = max(1, int(duration * self.fps))
        
        # Load and preprocess the video frames
        pixel_values, num_patches_list = load_video(
            video_path, 
            bound=None, 
            input_size=self.input_size, 
            max_num=self.max_num, 
            num_segments=num_segments
        )
        
        pixel_values = pixel_values.to(torch.bfloat16).cuda()
        
        # Build the per-frame <image> placeholder prefix
        video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
        
        # Build the full prompt
        full_prompt = f"{meta_prompt}\n{prompt}\n{video_prefix}"
        
        # Run inference with the model
        try:
            response, history = self.model.chat(
                self.tokenizer, 
                pixel_values, 
                full_prompt, 
                self.generation_config,
                num_patches_list=num_patches_list, 
                history=None, 
                return_history=True
            )
            if self.thinking:
                # Extract the final answer after the thinking section
                if '</think>' in response:
                    response = response.split('</think>', 1)[-1].strip()
                else:
                    response = response.strip()
            return response
        except Exception as e:
            print(f"模型推理过程中发生错误: {str(e)}")
            raise e
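
A minimal usage sketch (the model directory name, video path, and prompts below are placeholders; adjust them to your own setup):

if __name__ == '__main__':
    # Assumes the non-HF checkpoint was downloaded to ./models/InternVL/InternVL3_5-8B
    runner = TestModel(model='InternVL3_5-8B', thinking=False)
    answer = runner.process_video_with_internvl(
        video_path='./videos/example.mp4',
        meta_prompt='You are a helpful assistant for video understanding.',
        prompt='Describe what happens in this video.'
    )
    print(answer)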

Lee-xeo avatar Sep 10 '25 05:09 Lee-xeo

@Lee-xeo Thanks for sharing. But is there a way to make it work with the -HF version too? Following the documentation here: https://huggingface.co/docs/transformers/v4.55.4/en/model_doc/internvl#video-input, it does not work.

nimeshagrawal avatar Sep 10 '25 05:09 nimeshagrawal

I ran into the same problem. When running video inference with the -HF model weights, I get: RuntimeError: shape '[120, 27, 13, 2048]' is invalid for input of size 89579520

Rio-Allen avatar Sep 25 '25 15:09 Rio-Allen

Change the "size" setting in 'video_preprocessor_config.json' to 448 and it will work.
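
For clarity, the relevant entry in video_preprocessor_config.json would look roughly like this after the change (only the "size" field is shown; the exact key layout may differ between checkpoints, so treat this as a sketch):

"size": {
  "height": 448,
  "width": 448
}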

nguyen-khang-ntq avatar Nov 13 '25 06:11 nguyen-khang-ntq

@nguyen-khang-ntq There is another way to fix this. When processing the messages, we can use the following:

inputs = processor.apply_chat_template(
    messages,
    return_tensors="pt",
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    fps=fps,
    size={"height": height, "width": width}
).to(model.device, dtype=torch.bfloat16)

where height and width need to be 448. With this method, you can even run inference at an FPS of your choice.
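
Putting it together, a video-inference sketch with an -HF checkpoint might look like the following (based on the linked Transformers docs; the model id, video URL, fps value, and token limit are placeholders to adapt):

import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

model_id = "OpenGVLab/InternVL3_5-8B-HF"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "video", "url": "https://example.com/video.mp4"},  # placeholder URL
            {"type": "text", "text": "Describe this video."},
        ],
    }
]

# Force 448x448 frames so the vision features match what the model expects.
inputs = processor.apply_chat_template(
    messages,
    return_tensors="pt",
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    fps=1,
    size={"height": 448, "width": 448},
).to(model.device, dtype=torch.bfloat16)

output_ids = model.generate(**inputs, max_new_tokens=128)
# Decode only the newly generated tokens
print(processor.decode(output_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True))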

nimeshagrawal avatar Nov 13 '25 06:11 nimeshagrawal

@nimeshagrawal I did this and it worked.

nguyen-khang-ntq avatar Nov 13 '25 07:11 nguyen-khang-ntq

Related issue: https://github.com/OpenGVLab/InternVL/issues/1222

nguyen-khang-ntq avatar Nov 13 '25 07:11 nguyen-khang-ntq