
[Bug] InternVL2.5-8B has a bug

Open whysirier opened this issue 1 year ago • 3 comments

Checklist

  • [ ] 1. I have searched related issues but cannot get the expected help.
  • [ ] 2. The bug has not been fixed in the latest version.
  • [ ] 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.

Describe the bug

1) Inference result with lmdeploy: (image)
2) Inference result with transformers, which is much worse: (image) Some values in the table are recognized incorrectly.

It is clear that transformers inference has a serious bug.

Reproduction

Run the script.

Environment

transformers = 4.46

Error traceback

Below is the code used for transformers inference.

import random
import string
import os
import re

import numpy as np
import torch
import math
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
from flask import Flask, request, jsonify
from flask_cors import CORS
import json
import time
import fitz
import requests
import io
import base64
from io import BytesIO
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}


IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(MyEncoder, self).default(obj)

def pdf2img(pdf_file):
    doc = fitz.open(stream=pdf_file, filetype='pdf')
    images = []
    for page in doc:
        zoom_x = 2
        zoom_y = 2
        mat = fitz.Matrix(zoom_x, zoom_y)
        pix = page.get_pixmap(matrix=mat)
        # print("type", type(pix))
        img_dir = 'img_dir/'
        pix.save(r"{}page-{}.png".format(img_dir, page.number))
        images.append(r"{}page-{}.png".format(img_dir, page.number))
    return images

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

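# Pick the tiling grid (columns, rows) whose aspect ratio is closest to the input image's aspect ratio.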
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

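# Resize the image to the chosen grid and split it into image_size x image_size tiles (at most max_num);
# optionally append a square thumbnail of the whole image.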
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

def split_model(model_name):
    device_map = {}
    world_size = torch.cuda.device_count()
    num_layers = {
        'InternVL2_5-1B': 24, 'InternVL2_5-2B': 24, 'InternVL2_5-4B': 36, 'InternVL2_5-8B': 32,
        'InternVL2_5-26B': 48, 'InternVL2_5-38B': 64, 'InternVL2_5-78B': 80}[model_name]
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0

    return device_map

path = '/mnt/data/spdi-code/IDCAI/Qwen/InternVL2_5-8B'
device_map = split_model('InternVL2_5-8B')

model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map=device_map).eval()

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)


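# Extract quota codes matching the pattern [a-zA-Z0-9]+-[0-9]+ and return them as a set.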
def get_quota_v1(text):
    result = []
    # text = text.split('\n')

    pattern = r'[a-zA-Z0-9]+-[0-9]+'

    result = re.findall(pattern, text)
    return set(result)


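# Parse the model's line-based answer (e.g. '定额编号:XXX,数量:XXX,...') into dicts with
# quota_id, quota_tot, val1, val2, tot_val1 and tot_val2; `mark` records which question produced the text.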
def get_quota_v2(text, mark):
    quota_lst = text.split('\n')
    quota_res = []
    for quota_info in quota_lst:
        res = {}
        print("--------------------")
        tmp = quota_info.split(',')
        print("tmp:", tmp)
        try:
            quota_code = tmp[0].split(':')[1]
            #res['定额编号'] = quota_code
            res['quota_id'] = quota_code
            res['mark'] = mark
            if len(get_quota_v1(quota_code)) > 0:
                try:
                    quota_num = tmp[1].split(':')[1]
                    #res[tmp[1].split(':')[0]] = quota_num
                    res['quota_tot'] = quota_num
                except:
                    continue
                try:
                    quota_val1 = tmp[2].split(':')[1]
                    #res[tmp[2].split(':')[0]] = quota_val1
                    res['val1'] = quota_val1
                except:
                    continue
    
                try:
                    quota_val2 = tmp[3].split(':')[1]
                    #res[tmp[3].split(':')[0]] = quota_val2
                    res['val2'] = quota_val2
                except:
                    continue

                try:
                    quota_val3 = tmp[4].split(':')[1]
                    #res[tmp[4].split(':')[0]] = quota_val3
                    res['tot_val1'] = quota_val3
                except:
                    continue

                try:
                    quota_val4 = tmp[5].split(':')[1]
                    #res[tmp[5].split(':')[0]] = quota_val4
                    res['tot_val2'] = quota_val4
                except:
                    continue
                quota_res.append(res)
            else:
                continue
        except:
            continue
    # print("定额编号:", quota_code)
    # print("数值:", quota_val1)
    # print("数值:", quota_val2)


    return quota_res


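# Note: get_quota is only referenced from the commented-out block at the end of the script.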
def get_quota(text):
    quota_lst = text.split('\n')
    res = []
    for quota in quota_lst:
        tmp = quota.split(':')
        if len(tmp) < 2:
            continue
        else:
            quota_id = tmp[0]
            quota_name = tmp[1]
            if len(get_quota_v1(quota_id)) != 0:
                res.append({'定额编号': quota_id, '单价': quota_name})
    return res


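# NOTE: this redefines pdf2img above; this version opens the PDF by file path (not from a stream) and renders pages at 3x zoom.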
def pdf2img(pdf_file):
    doc = fitz.open(pdf_file, filetype='pdf')
    images = []
    for page in doc:
        zoom_x = 3.0
        zoom_y = 3.0
        mat = fitz.Matrix(zoom_x, zoom_y)
        pix = page.get_pixmap(matrix=mat)
        # print("type", type(pix))
        img_dir = 'img_dir/'
        pix.save(r"{}page-{}.png".format(img_dir, page.number))
        images.append(r"{}page-{}.png".format(img_dir, page.number))
    return images


# model_dir = "/mnt/data/spdi-code/VLModel/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4"
# model_dir = '/home/spdi-code/quota_api/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4'
# default: Load the model on the available device(s)
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#    model_dir='qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4', torch_dtype="auto", device_map="auto"
# )

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.


# reranker = FlagReranker('/mnt/data/spdi-code/model_api/bge-reranker-v2-m3', use_fp16=True)
# default processer
# processor = AutoProcessor.from_pretrained(model_dir)

# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28

images = pdf2img('设计预算表2.pdf')
# question = "请列出单体建筑的建筑名称、面积面积和层数。"
# question = "该文件是否有单体建筑的信息,若有请列出单体建筑的建筑名称、面积面积和层数。若无则输出'none'。"

#model = '/mnt/data/spdi-code/SpdiChat/dataroot/models/InternVL2_5-8B'
#model = '/mnt/data/spdi-code/SpdiChat/dataroot/models/InternVL2_5-26B'
#pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))


def check_quota(image_dir, question):
    #image = load_image(image_dir)
    #response = pipe((question, image_dir))
    pixel_values = load_image(image_dir, max_num=12).to(torch.bfloat16)
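    # Note: do_sample=True makes the generation non-deterministic across runs.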
    generation_config = dict(max_new_tokens=1024, do_sample=True)

    question = '<image>\n{}'.format(question)
    response = model.chat(tokenizer, pixel_values, question, generation_config)

    return response

    #return response.text


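# Sanity-check the extracted numbers: for mark == 0, each 合计值 should equal the corresponding 单位定额值 * 数量;
# for mark == 1, 合价 should equal 单价 * 消耗量(合计值) and 消耗量(合计值) should equal 消耗量(单位定额) * 数量.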
def evaluate_quota(quota_info):
    print("ai识别数量:",len(quota_info))
    res  = 0
    for quota in quota_info:
        try:
            
            tot = float(quota['quota_tot'])
            x1 = float(quota['val1'])
            x2 = float(quota['val2'])
            y1 = float(quota['tot_val1'])
            y2 = float(quota['tot_val2'])
            
            if quota['mark'] == 0: 
                #tot = float(quota['quota_tot'])
            
                if round(x1 * tot, 0) == round(y1, 0) and round(x2 * tot, 0) == round(y2, 0):
                    print("合计值正确!")
                    res += 1
                else:
                    print(quota)
                    print("合计值错误!", tot, x1, x1 * tot, y1)
                    print("----合计值!", tot, x2, x2 * tot, y2)
            else:

                if round(x2 * y1, 0) == round(y2, 0) and round(x1 * tot, 0) == round(y1, 0):
                    print("合计值正确!")
                    res += 1
                else:
                    print(quota)
                    print("---, ", round(x2 * y1, 0), round(y2, 0))
                    print("---, ", round(x1 * tot, 0), round(y1, 0))

        except:
            print("报错", quota)
            continue
        
    print("正确数量", res)

res = {}
question1 = "该页有单位定额值(工日)的内容么?"
question2 = "该页有消耗量(台班)的内容么?"
question3 = "请将定额编号及其对应的数量,单位定额值(工日)中技工和普工以及合计值(工日)中的技工和普工内容提取出来,遇到技工或者普工为空值,则默认数值为0。结果输出样例为:'定额编号:XXX,数量:XXX,技工(单位定额):XXX,普工(单位定额):XXX,技工(合计值):XXX,普工(合计值):XXX'。不要输出多余内容。"
question4 = "请将定额编号及其对应的数量,单位定额值中消耗量(台班)和单价(元)以及合计值中消耗量(台班)和合价(元)提取出来。结果输出样例为:'定额编号:XXX,数量:XXX,消耗量(单位定额):XXX,单价(单位定额):XXX,消耗量(合计值):XXX,合价(合计值):XXX'。不要输出多余内容。"

quota_info = []

for image in images:
    print("路径", image)

    res = check_quota(image, question1)
    if '没有' not in res:
        res = check_quota(image, question3)
        print("res---", res)
        res = get_quota_v2(res, 0)
        print("res---", res)
        # print("定额内容:", res)
        print("定额内容: ", res)
        quota_info += res
    else:
        res = check_quota(image, question2)
        if '没有' in res:
            continue
        else:
            res = check_quota(image, question4)
            print("res---", res)
            res = get_quota_v2(res, 1)
            print("res---", res)
            # print("定额内容:", res)
            print("定额内容:", res)
            quota_info += res
print("提取到的定额信息如下:", quota_info)

evaluate_quota(quota_info)

# start_time = time.time()

"""
    if len(get_quota(output_text[0])) != 0:
        if len(res) == 0:
            res = get_quota(output_text[0])
        else:
            res.update(get_quota(output_text[0]))


print("结果:", res)

"""



Simply switching from transformers to lmdeploy gives good results, but lmdeploy cannot support multi-threading.
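For reference, a minimal lmdeploy sketch for the same model (a rough example pieced together from the commented-out pipeline code above; the model path, image path and prompt are reused from this script, and the exact API usage should be checked against the lmdeploy docs):

from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

# TurboMind-backed pipeline for InternVL2_5-8B (same local path as in the script above)
pipe = pipeline('/mnt/data/spdi-code/IDCAI/Qwen/InternVL2_5-8B',
                backend_config=TurbomindEngineConfig(session_len=8192))

# Run one rendered PDF page with question1 from the script above
image = load_image('img_dir/page-0.png')
response = pipe(('该页有单位定额值(工日)的内容么?', image))
print(response.text)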

whysirier avatar Dec 24 '24 07:12 whysirier

In my inference results there are quite a lot of "r"s... I don't know why, whether I use lmdeploy or transformers. I don't have a screenshot for the 8B model, but the 8B model really outputs a pile of "r"s. I even expected the 26B model not to show this, but it still does, as shown in the image below.

Image

dongwhfdyer avatar Jan 17 '25 11:01 dongwhfdyer

In my inference results there are quite a lot of "r"s... I don't know why, whether I use lmdeploy or transformers. I don't have a screenshot for the 8B model, but the 8B model really outputs a pile of "r"s. I even expected the 26B model not to show this, but it still does, as shown in the image below.

Image

I ran into a similar problem; the 1B model actually gives better results.

josephzpng avatar Jan 31 '25 13:01 josephzpng

Hi, have you tried transformers version 4.37.2?

yuecao0119 avatar Feb 25 '25 07:02 yuecao0119

@whysirier I'm a beginner. This looks like it follows a standard template, so wouldn't direct batch pattern matching work? Why use a large model for everything?

babyhyf avatar May 15 '25 06:05 babyhyf