InternVL
[Bug] InternVL2.5-8B: transformers inference gives wrong results
Checklist
- [ ] 1. I have searched related issues but cannot get the expected help.
- [ ] 2. The bug has not been fixed in the latest version.
- [ ] 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
Describe the bug
1) Inference results with lmdeploy:
2) Inference results with transformers are very poor:
Some values in the table are recognized incorrectly.
This suggests a serious bug in the transformers inference path.
Reproduction
Run the script below.
Environment
transformers = 4.46
Error traceback
Below is the transformers inference code:
import random
import string
import os
import numpy as np
import torch
import math
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
from flask import Flask, request, jsonify
from flask_cors import CORS
import json
import time
import fitz
import requests
import re
import io
import base64
from io import BytesIO
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
class MyEncoder(json.JSONEncoder):
def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
return obj.tolist()
else:
return super(MyEncoder, self).default(obj)
def pdf2img(pdf_file):
doc = fitz.open(stream=pdf_file, filetype='pdf')
images = []
for page in doc:
zoom_x = 2
zoom_y = 2
mat = fitz.Matrix(zoom_x, zoom_y)
pix = page.get_pixmap(matrix=mat)
# print("type", type(pix))
img_dir = 'img_dir/'
pix.save(r"{}page-{}.png".format(img_dir, page.number))
images.append(r"{}page-{}.png".format(img_dir, page.number))
return images
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def load_image(image_file, input_size=448, max_num=12):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
def split_model(model_name):
device_map = {}
world_size = torch.cuda.device_count()
num_layers = {
'InternVL2_5-1B': 24, 'InternVL2_5-2B': 24, 'InternVL2_5-4B': 36, 'InternVL2_5-8B': 32,
'InternVL2_5-26B': 48, 'InternVL2_5-38B': 64, 'InternVL2_5-78B': 80}[model_name]
# Since the first GPU will be used for ViT, treat it as half a GPU.
num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
num_layers_per_gpu = [num_layers_per_gpu] * world_size
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = i
layer_cnt += 1
device_map['vision_model'] = 0
device_map['mlp1'] = 0
device_map['language_model.model.tok_embeddings'] = 0
device_map['language_model.model.embed_tokens'] = 0
device_map['language_model.output'] = 0
device_map['language_model.model.norm'] = 0
device_map['language_model.lm_head'] = 0
device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
return device_map
path = '/mnt/data/spdi-code/IDCAI/Qwen/InternVL2_5-8B'
device_map = split_model('InternVL2_5-8B')
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
device_map=device_map).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
def get_quota_v1(text):
result = []
# text = text.split('\n')
pattern = r'[a-zA-Z0-9]+-[0-9]+'
result = re.findall(pattern, text)
return set(result)
def get_quota_v2(text, mark):
quota_lst = text.split('\n')
quota_res = []
for quota_info in quota_lst:
res = {}
print("--------------------")
tmp = quota_info.split(',')
print("tmp:", tmp)
try:
quota_code = tmp[0].split(':')[1]
#res['定额编号'] = quota_code
res['quota_id'] = quota_code
res['mark'] = mark
if len(get_quota_v1(quota_code)) > 0:
try:
quota_num = tmp[1].split(':')[1]
#res[tmp[1].split(':')[0]] = quota_num
res['quota_tot'] = quota_num
except:
continue
try:
quota_val1 = tmp[2].split(':')[1]
#res[tmp[2].split(':')[0]] = quota_val1
res['val1'] = quota_val1
except:
continue
try:
quota_val2 = tmp[3].split(':')[1]
#res[tmp[3].split(':')[0]] = quota_val2
res['val2'] = quota_val2
except:
continue
try:
quota_val3 = tmp[4].split(':')[1]
#res[tmp[4].split(':')[0]] = quota_val3
res['tot_val1'] = quota_val3
except:
continue
try:
quota_val4 = tmp[5].split(':')[1]
#res[tmp[5].split(':')[0]] = quota_val4
res['tot_val2'] = quota_val4
except:
continue
quota_res.append(res)
else:
continue
except:
continue
# print("定额编号:", quota_code)
# print("数值:", quota_val1)
# print("数值:", quota_val2)
return quota_res
def get_quota(text):
quota_lst = text.split('\n')
res = []
for quota in quota_lst:
tmp = quota.split(':')
if len(tmp) < 2:
continue
else:
quota_id = tmp[0]
quota_name = tmp[1]
if len(get_quota_v1(quota_id)) != 0:
res.append({'定额编号': quota_id, '单价': quota_name})
return res
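# NOTE: this second pdf2img definition overrides the one defined earlier;
# it opens the PDF by file path and renders pages at 3x zoom.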
def pdf2img(pdf_file):
doc = fitz.open(pdf_file, filetype='pdf')
images = []
for page in doc:
zoom_x = 3.0
zoom_y = 3.0
mat = fitz.Matrix(zoom_x, zoom_y)
pix = page.get_pixmap(matrix=mat)
# print("type", type(pix))
img_dir = 'img_dir/'
pix.save(r"{}page-{}.png".format(img_dir, page.number))
images.append(r"{}page-{}.png".format(img_dir, page.number))
return images
# model_dir = "/mnt/data/spdi-code/VLModel/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4"
# model_dir = '/home/spdi-code/quota_api/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4'
# default: Load the model on the available device(s)
# model = Qwen2VLForConditionalGeneration.from_pretrained(
# model_dir='qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4', torch_dtype="auto", device_map="auto"
# )
# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# reranker = FlagReranker('/mnt/data/spdi-code/model_api/bge-reranker-v2-m3', use_fp16=True)
# default processer
# processor = AutoProcessor.from_pretrained(model_dir)
# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
images = pdf2img('设计预算表2.pdf')
# question = "请列出单体建筑的建筑名称、面积面积和层数。"
# question = "该文件是否有单体建筑的信息,若有请列出单体建筑的建筑名称、面积面积和层数。若无则输出'none'。"
#model = '/mnt/data/spdi-code/SpdiChat/dataroot/models/InternVL2_5-8B'
#model = '/mnt/data/spdi-code/SpdiChat/dataroot/models/InternVL2_5-26B'
#pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
def check_quota(image_dir, question):
#image = load_image(image_dir)
#response = pipe((question, image_dir))
    # As in the official InternVL example, move pixel_values onto the GPU that hosts vision_model.
    pixel_values = load_image(image_dir, max_num=12).to(torch.bfloat16).cuda()
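    # do_sample=True below makes generation stochastic, so repeated runs (and backend comparisons) can differ.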
generation_config = dict(max_new_tokens=1024, do_sample=True)
question = '<image>\n{}'.format(question)
response = model.chat(tokenizer, pixel_values, question, generation_config)
return response
#return response.text
def evaluate_quota(quota_info):
print("ai识别数量:",len(quota_info))
res = 0
for quota in quota_info:
try:
tot = float(quota['quota_tot'])
x1 = float(quota['val1'])
x2 = float(quota['val2'])
y1 = float(quota['tot_val1'])
y2 = float(quota['tot_val2'])
if quota['mark'] == 0:
#tot = float(quota['quota_tot'])
if round(x1 * tot, 0) == round(y1, 0) and round(x2 * tot, 0) == round(y2, 0):
print("合计值正确!")
res += 1
else:
print(quota)
print("合计值错误!", tot, x1, x1 * tot, y1)
print("----合计值!", tot, x2, x2 * tot, y2)
else:
if round(x2 * y1, 0) == round(y2, 0) and round(x1 * tot, 0) == round(y1, 0):
print("合计值正确!")
res += 1
else:
print(quota)
print("---, ", round(x2 * y1, 0), round(y2, 0))
print("---, ", round(x1 * tot, 0), round(y1, 0))
except:
print("报错", quota)
continue
print("正确数量", res)
res = {}
question1 = "该页有单位定额值(工日)的内容么?"
question2 = "该页有消耗量(台班)的内容么?"
question3 = "请将定额编号及其对应的数量,单位定额值(工日)中技工和普工以及合计值(工日)中的技工和普工内容提取出来,遇到技工或者普工为空值,则默认数值为0。结果输出样例为:'定额编号:XXX,数量:XXX,技工(单位定额):XXX,普工(单位定额):XXX,技工(合计值):XXX,普工(合计值):XXX'。不要输出多余内容。"
question4 = "请将定额编号及其对应的数量,单位定额值中消耗量(台班)和单价(元)以及合计值中消耗量(台班)和合价(元)提取出来。结果输出样例为:'定额编号:XXX,数量:XXX,消耗量(单位定额):XXX,单价(单位定额):XXX,消耗量(合计值):XXX,合价(合计值):XXX'。不要输出多余内容。"
quota_info = []
for image in images:
print("路径", image)
res = check_quota(image, question1)
if '没有' not in res:
res = check_quota(image, question3)
print("res---", res)
res = get_quota_v2(res, 0)
print("res---", res)
# print("定额内容:", res)
print("定额内容: ", res)
quota_info += res
else:
res = check_quota(image, question2)
if '没有' in res:
continue
else:
res = check_quota(image, question4)
print("res---", res)
res = get_quota_v2(res, 1)
print("res---", res)
# print("定额内容:", res)
print("定额内容:", res)
quota_info += res
print("提取到的定额信息如下:", quota_info)
evaluate_quota(quota_info)
# start_time = time.time()
"""
if len(get_quota(output_text[0])) != 0:
if len(res) == 0:
res = get_quota(output_text[0])
else:
res.update(get_quota(output_text[0]))
print("结果:", res)
"""
If I simply switch from transformers to lmdeploy the results are quite good, but lmdeploy does not support multi-threading.
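For reference, the lmdeploy run I compared against was roughly the snippet below (a minimal sketch reconstructed from the commented-out pipeline lines in the script; the page image path is illustrative):

```python
# Minimal lmdeploy sketch (reconstructed from the commented-out lines above; paths are illustrative).
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image as lmdeploy_load_image

pipe = pipeline('/mnt/data/spdi-code/IDCAI/Qwen/InternVL2_5-8B',
                backend_config=TurbomindEngineConfig(session_len=8192))

image = lmdeploy_load_image('img_dir/page-0.png')  # a page rendered by pdf2img above
response = pipe(('该页有单位定额值(工日)的内容么?', image))
print(response.text)
```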
During inference I also get a lot of 'r' characters... I don't know why; it happens with both lmdeploy and transformers. I don't have a screenshot of the 8B output, but it really is full of 'r's. I expected the 26B model not to show this, but it still does.
I ran into a similar problem; the 1B model actually gives better results.
Hi, have you tried transformers 4.37.2?
@whysirier I'm a beginner here, but this looks like it follows a standard template. Couldn't you just do batch rule-based matching directly? Why use a large model for everything?
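(Just to illustrate what I mean, a rough rule-based pass might look like the sketch below; it assumes the text lines follow the '定额编号:XXX,数量:XXX,...' format from the prompts in the script, and is only an illustration, not a tested approach.)

```python
# Rough rule-based sketch (illustration only); assumes lines shaped like the prompt
# examples above, e.g. '定额编号:XXX,数量:XXX,...'. Not a drop-in replacement for the VLM step.
import re

QUOTA_ID = re.compile(r'[a-zA-Z0-9]+-[0-9]+')  # same quota-id pattern as get_quota_v1

def parse_line(line: str) -> dict:
    """Split a 'key:value,key:value' style line into a field dict."""
    fields = {}
    for part in line.split(','):
        if ':' in part:
            key, value = part.split(':', 1)
            fields[key.strip()] = value.strip()
    return fields if QUOTA_ID.search(fields.get('定额编号', '')) else {}
```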