
[Bug] InternVL2.5-8B has a bug

Open whysirier opened this issue 1 year ago • 3 comments

Checklist

  • [ ] 1. I have searched related issues but cannot get the expected help.
  • [ ] 2. The bug has not been fixed in the latest version.
  • [ ] 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.

Describe the bug

1) Inference result with lmdeploy: (image)
2) Inference result with transformers, which is much worse: (image) Some values in the table are recognized incorrectly.

It is clear that transformers inference has a serious bug.

Reproduction

Run the script.

Environment

transformers = 4.46

Error traceback

Below is the code used for transformers inference.

import random
import string
import os
import re

import numpy as np
import torch
import math
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
from flask import Flask, request, jsonify
from flask_cors import CORS
import json
import time
import fitz
import requests
import io
import base64
from io import BytesIO
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}


IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(MyEncoder, self).default(obj)

def pdf2img(pdf_file):
    doc = fitz.open(stream=pdf_file, filetype='pdf')
    images = []
    for page in doc:
        zoom_x = 2
        zoom_y = 2
        mat = fitz.Matrix(zoom_x, zoom_y)
        pix = page.get_pixmap(matrix=mat)
        # print("type", type(pix))
        img_dir = 'img_dir/'
        pix.save(r"{}page-{}.png".format(img_dir, page.number))
        images.append(r"{}page-{}.png".format(img_dir, page.number))
    return images

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

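# Pick the tiling grid (columns, rows) whose aspect ratio is closest to the input image's aspect ratio.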
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

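# Resize the image to the chosen grid and split it into image_size x image_size tiles (at most max_num);
# optionally append a square thumbnail of the whole image.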
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

def split_model(model_name):
    device_map = {}
    world_size = torch.cuda.device_count()
    num_layers = {
        'InternVL2_5-1B': 24, 'InternVL2_5-2B': 24, 'InternVL2_5-4B': 36, 'InternVL2_5-8B': 32,
        'InternVL2_5-26B': 48, 'InternVL2_5-38B': 64, 'InternVL2_5-78B': 80}[model_name]
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0

    return device_map

path = '/mnt/data/spdi-code/IDCAI/Qwen/InternVL2_5-8B'
device_map = split_model('InternVL2_5-8B')

model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map=device_map).eval()

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)


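# Extract quota codes matching the pattern [a-zA-Z0-9]+-[0-9]+ and return them as a set.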
def get_quota_v1(text):
    result = []
    # text = text.split('\n')

    pattern = r'[a-zA-Z0-9]+-[0-9]+'

    result = re.findall(pattern, text)
    return set(result)


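# Parse the model's line-based answer (e.g. '定额编号:XXX,数量:XXX,...') into dicts with
# quota_id, quota_tot, val1, val2, tot_val1 and tot_val2; `mark` records which question produced the text.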
def get_quota_v2(text, mark):
    quota_lst = text.split('\n')
    quota_res = []
    for quota_info in quota_lst:
        res = {}
        print("--------------------")
        tmp = quota_info.split(',')
        print("tmp:", tmp)
        try:
            quota_code = tmp[0].split(':')[1]
            #res['定额编号'] = quota_code
            res['quota_id'] = quota_code
            res['mark'] = mark
            if len(get_quota_v1(quota_code)) > 0:
                try:
                    quota_num = tmp[1].split(':')[1]
                    #res[tmp[1].split(':')[0]] = quota_num
                    res['quota_tot'] = quota_num
                except:
                    continue
                try:
                    quota_val1 = tmp[2].split(':')[1]
                    #res[tmp[2].split(':')[0]] = quota_val1
                    res['val1'] = quota_val1
                except:
                    continue
    
                try:
                    quota_val2 = tmp[3].split(':')[1]
                    #res[tmp[3].split(':')[0]] = quota_val2
                    res['val2'] = quota_val2
                except:
                    continue

                try:
                    quota_val3 = tmp[4].split(':')[1]
                    #res[tmp[4].split(':')[0]] = quota_val3
                    res['tot_val1'] = quota_val3
                except:
                    continue

                try:
                    quota_val4 = tmp[5].split(':')[1]
                    #res[tmp[5].split(':')[0]] = quota_val4
                    res['tot_val2'] = quota_val4
                except:
                    continue
                quota_res.append(res)
            else:
                continue
        except:
            continue
    # print("定额编号:", quota_code)
    # print("数值:", quota_val1)
    # print("数值:", quota_val2)


    return quota_res


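# Note: get_quota is only referenced from the commented-out block at the end of the script.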
def get_quota(text):
    quota_lst = text.split('\n')
    res = []
    for quota in quota_lst:
        tmp = quota.split(':')
        if len(tmp) < 2:
            continue
        else:
            quota_id = tmp[0]
            quota_name = tmp[1]
            if len(get_quota_v1(quota_id)) != 0:
                res.append({'定额编号': quota_id, '单价': quota_name})
    return res


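# NOTE: this redefines pdf2img above; this version opens the PDF by file path (not from a stream) and renders pages at 3x zoom.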
def pdf2img(pdf_file):
    doc = fitz.open(pdf_file, filetype='pdf')
    images = []
    for page in doc:
        zoom_x = 3.0
        zoom_y = 3.0
        mat = fitz.Matrix(zoom_x, zoom_y)
        pix = page.get_pixmap(matrix=mat)
        # print("type", type(pix))
        img_dir = 'img_dir/'
        pix.save(r"{}page-{}.png".format(img_dir, page.number))
        images.append(r"{}page-{}.png".format(img_dir, page.number))
    return images


# model_dir = "/mnt/data/spdi-code/VLModel/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4"
# model_dir = '/home/spdi-code/quota_api/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4'
# default: Load the model on the available device(s)
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#    model_dir='qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4', torch_dtype="auto", device_map="auto"
# )

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.


# reranker = FlagReranker('/mnt/data/spdi-code/model_api/bge-reranker-v2-m3', use_fp16=True)
# default processer
# processor = AutoProcessor.from_pretrained(model_dir)

# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28

images = pdf2img('设计预算表2.pdf')
# question = "请列出单体建筑的建筑名称、面积面积和层数。"
# question = "该文件是否有单体建筑的信息,若有请列出单体建筑的建筑名称、面积面积和层数。若无则输出'none'。"

#model = '/mnt/data/spdi-code/SpdiChat/dataroot/models/InternVL2_5-8B'
#model = '/mnt/data/spdi-code/SpdiChat/dataroot/models/InternVL2_5-26B'
#pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))


def check_quota(image_dir, question):
    #image = load_image(image_dir)
    #response = pipe((question, image_dir))
    pixel_values = load_image(image_dir, max_num=12).to(torch.bfloat16)
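    # Note: do_sample=True makes the generation non-deterministic across runs.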
    generation_config = dict(max_new_tokens=1024, do_sample=True)

    question = '<image>\n{}'.format(question)
    response = model.chat(tokenizer, pixel_values, question, generation_config)

    return response

    #return response.text


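# Sanity-check the extracted numbers: for mark == 0, each 合计值 should equal the corresponding 单位定额值 * 数量;
# for mark == 1, 合价 should equal 单价 * 消耗量(合计值) and 消耗量(合计值) should equal 消耗量(单位定额) * 数量.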
def evaluate_quota(quota_info):
    print("ai识别数量:",len(quota_info))
    res  = 0
    for quota in quota_info:
        try:
            
            tot = float(quota['quota_tot'])
            x1 = float(quota['val1'])
            x2 = float(quota['val2'])
            y1 = float(quota['tot_val1'])
            y2 = float(quota['tot_val2'])
            
            if quota['mark'] == 0: 
                #tot = float(quota['quota_tot'])
            
                if round(x1 * tot, 0) == round(y1, 0) and round(x2 * tot, 0) == round(y2, 0):
                    print("合计值正确!")
                    res += 1
                else:
                    print(quota)
                    print("合计值错误!", tot, x1, x1 * tot, y1)
                    print("----合计值!", tot, x2, x2 * tot, y2)
            else:

                if round(x2 * y1, 0) == round(y2, 0) and round(x1 * tot, 0) == round(y1, 0):
                    print("合计值正确!")
                    res += 1
                else:
                    print(quota)
                    print("---, ", round(x2 * y1, 0), round(y2, 0))
                    print("---, ", round(x1 * tot, 0), round(y1, 0))

        except:
            print("报错", quota)
            continue
        
    print("正确数量", res)

res = {}
question1 = "该页有单位定额值(工日)的内容么?"
question2 = "该页有消耗量(台班)的内容么?"
question3 = "请将定额编号及其对应的数量,单位定额值(工日)中技工和普工以及合计值(工日)中的技工和普工内容提取出来,遇到技工或者普工为空值,则默认数值为0。结果输出样例为:'定额编号:XXX,数量:XXX,技工(单位定额):XXX,普工(单位定额):XXX,技工(合计值):XXX,普工(合计值):XXX'。不要输出多余内容。"
question4 = "请将定额编号及其对应的数量,单位定额值中消耗量(台班)和单价(元)以及合计值中消耗量(台班)和合价(元)提取出来。结果输出样例为:'定额编号:XXX,数量:XXX,消耗量(单位定额):XXX,单价(单位定额):XXX,消耗量(合计值):XXX,合价(合计值):XXX'。不要输出多余内容。"

quota_info = []

for image in images:
    print("路径", image)

    res = check_quota(image, question1)
    if '没有' not in res:
        res = check_quota(image, question3)
        print("res---", res)
        res = get_quota_v2(res, 0)
        print("res---", res)
        # print("定额内容:", res)
        print("定额内容: ", res)
        quota_info += res
    else:
        res = check_quota(image, question2)
        if '没有' in res:
            continue
        else:
            res = check_quota(image, question4)
            print("res---", res)
            res = get_quota_v2(res, 1)
            print("res---", res)
            # print("定额内容:", res)
            print("定额内容:", res)
            quota_info += res
print("提取到的定额信息如下:", quota_info)

evaluate_quota(quota_info)

# start_time = time.time()

"""
    if len(get_quota(output_text[0])) != 0:
        if len(res) == 0:
            res = get_quota(output_text[0])
        else:
            res.update(get_quota(output_text[0]))


print("结果:", res)

"""



Simply switching from transformers to lmdeploy gives good results, but lmdeploy cannot support multi-threading.
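For reference, a minimal lmdeploy sketch for the same model (a rough example pieced together from the commented-out pipeline code above; the model path, image path and prompt are reused from this script, and the exact API usage should be checked against the lmdeploy docs):

from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

# TurboMind-backed pipeline for InternVL2_5-8B (same local path as in the script above)
pipe = pipeline('/mnt/data/spdi-code/IDCAI/Qwen/InternVL2_5-8B',
                backend_config=TurbomindEngineConfig(session_len=8192))

# Run one rendered PDF page with question1 from the script above
image = load_image('img_dir/page-0.png')
response = pipe(('该页有单位定额值(工日)的内容么?', image))
print(response.text)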

whysirier avatar Dec 24 '24 07:12 whysirier

In my inference results there are quite a lot of "r"s... I don't know why, whether I use lmdeploy or transformers. I don't have a screenshot for the 8B model, but the 8B model really outputs a pile of "r"s. I even expected the 26B model not to show this, but it still does, as shown in the image below.

Image

dongwhfdyer avatar Jan 17 '25 11:01 dongwhfdyer

In my inference results there are quite a lot of "r"s... I don't know why, whether I use lmdeploy or transformers. I don't have a screenshot for the 8B model, but the 8B model really outputs a pile of "r"s. I even expected the 26B model not to show this, but it still does, as shown in the image below.

Image

I ran into a similar problem; the 1B model actually gives better results.

josephzpng avatar Jan 31 '25 13:01 josephzpng

Hi, have you tried transformers version 4.37.2?

yuecao0119 avatar Feb 25 '25 07:02 yuecao0119

@whysirier I'm a beginner. This looks like it follows a standard template, so wouldn't direct batch pattern matching work? Why use a large model for everything?

babyhyf avatar May 15 '25 06:05 babyhyf