VLMEvalKit
VLMEvalKit copied to clipboard
KeyError: 'answer'
File "/root/.local/lib/python3.12/site-packages/vlmeval/dataset/image_mcq.py", line 181, in evaluate answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])} ~~~~^^^^^^^^^^ File "/root/miniconda3/lib/python3.12/site-packages/pandas/core/frame.py", line 4090, in getitem indexer = self.columns.get_loc(key) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/miniconda3/lib/python3.12/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc raise KeyError(key) from err KeyError: 'answer'
Hello, how can I resolve this issue when evaluating on MME?
Hi, @lucasjinreal , I failed to reproduce this problem with MME and llava_v1.5_7b, would you please help provide a specific command to reproduce the error?
I am uncertain of the cause as well. Do you have any knowledge regarding the reason that leads to a key error? I have deleted the previous downloaded mme file but still encounter the same error. I have not conducted a comprehensive inspection of the vlmevalkit code, making it challenging to investigate where or which element could be the cause of this, yet I continue to encounter it.
I am uncertain of the cause as well. Do you have any knowledge regarding the reason that leads to a key error? I have deleted the previous downloaded mme file but still encounter the same error. I have not conducted a comprehensive inspection of the vlmevalkit code, making it challenging to investigate where or which element could be the cause of this, yet I continue to encounter it.
There are two things unusual according to the log
- MME is recognized as ImageMCQDataset, but actually it belongs to the ImageYORNDataset.
- The `answer` column is present in the tsv file, but the error says the `answer` column is missing.
Once again, I would like to emphasize that all data files used in VLMEvalKit should be automatically downloaded (for example, links of MME is in https://github.com/open-compass/VLMEvalKit/blob/main/vlmeval/dataset/image_yorn.py). Please make sure you are using data files downloaded from this link.
Yes, the MME data was downloaded automatically, so the cause could possibly be that it is recognized as ImageMCQDataset.
However, isn't this done automatically by VLMEvalKit? I didn't do anything outside VLMEvalKit, so how can I make sure it is treated correctly?
Yes, the MME data was downloaded automatically, so the cause could possibly be that it is recognized as ImageMCQDataset.
However, isn't this done automatically by VLMEvalKit? I didn't do anything outside VLMEvalKit, so how can I make sure it is treated correctly?
Would you please provide the exact command that triggers this error? Let me first try to reproduce it
I am not sure whether it can be reproduced or not. This is the command:
eval_datasets='MMBench_DEV_CN MMStar ChartQA_TEST Q-Bench1_VAL MMMU_VAL POPE HallusionBench RealWorldQA MMBench_TEST_EN MMBench_TEST_CN MMBench_TEST_EN_V11 MMBench_TEST_CN_V11 MME'
torchrun --nproc-per-node=$num_gpus --master-port=5699 run_vlmevalkit2.py --data $eval_datasets --model llava_qwen_7b_lora --verbose
This always fails at MME, so I put it last. Everything else works just fine.
def parse_args():
    """Parse the command-line options of the evaluation launcher.

    Returns:
        argparse.Namespace: the parsed options. ``--data`` and ``--model`` are
        required and each accepts one or more values; every other option has
        the default listed in the table below.
    """
    cli = argparse.ArgumentParser()

    # (flag, add_argument keyword options), registered in declaration order.
    option_specs = (
        ('--data', dict(type=str, nargs='+', required=True)),
        ('--model', dict(type=str, nargs='+', required=True)),
        # Args that only apply to Video Dataset
        ('--nframe', dict(type=int, default=8)),
        ('--pack', dict(action='store_true')),
        ('--work-dir', dict(type=str, default='./eval_results/', help='select the output directory')),
        ('--mode', dict(type=str, default='all', choices=['all', 'infer'])),
        ('--nproc', dict(type=int, default=4, help='Parallel API calling')),
        ('--retry', dict(type=int, default=None, help='retry numbers for API VLMs')),
        ('--judge', dict(type=str, default=None)),
        ('--ignore', dict(action='store_true', help='Ignore failed indices. ')),
        ('--verbose', dict(action='store_true')),
        ('--rerun', dict(action='store_true')),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()
def main():
    """Run inference and, optionally, evaluation for every model x dataset pair.

    For each model in ``--model`` and each dataset in ``--data`` this:

    1. builds the dataset (on rank 0 first when running distributed),
    2. runs inference through ``infer_data_job`` / ``infer_data_job_video``,
    3. assembles the judge keyword arguments, and
    4. on rank 0 with ``--mode all``, calls ``dataset.evaluate`` and logs the
       result.

    Datasets that are only scored by an external server are converted or
    skipped instead of being evaluated locally.
    """
    # Local import: only the --rerun cleanup below needs it.
    import glob

    logger = get_logger('RUN')

    args = parse_args()
    assert len(args.data), '--data should be a list of data files'

    if args.retry is not None:
        # NOTE(review): the 'verbose' override sits inside the `--retry` guard,
        # so it is only applied when --retry is given. Kept as in the original;
        # confirm whether that coupling is intended.
        for k, v in supported_VLM.items():
            if hasattr(v, 'keywords') and 'retry' in v.keywords:
                v.keywords['retry'] = args.retry
                supported_VLM[k] = v
            if hasattr(v, 'keywords') and 'verbose' in v.keywords:
                v.keywords['verbose'] = args.verbose
                supported_VLM[k] = v

    rank, world_size = get_rank_and_world_size()
    if world_size > 1:
        torch.cuda.set_device(rank)
        dist.init_process_group(backend='nccl', timeout=datetime.timedelta(seconds=10800))

    for model_name in args.model:
        model = None

        pred_root = osp.join(args.work_dir, model_name)
        os.makedirs(pred_root, exist_ok=True)

        for dataset_name in args.data:
            dataset_kwargs = {}
            if dataset_name == 'MMBench-Video':
                dataset_kwargs['pack'] = args.pack

            # If distributed, first build the dataset on the main process for doing preparation works
            if world_size > 1:
                dataset = build_dataset(dataset_name, **dataset_kwargs) if rank == 0 else None
                dist.barrier()

            dataset = build_dataset(dataset_name, **dataset_kwargs)
            if dataset is None:
                logger.error(f'Dataset {dataset_name} is not valid, will be skipped. ')
                continue

            result_file = f'{pred_root}/{model_name}_{dataset_name}.xlsx'
            if dataset_name in ['MMBench-Video']:
                packstr = 'pack' if args.pack else 'nopack'
                result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'

            if osp.exists(result_file) and args.rerun:
                # Remove stale judge outputs so they are regenerated. Done with
                # glob + os.remove instead of shelling out to `rm`, so that
                # model / dataset names are never interpreted by a shell.
                stale_prefix = glob.escape(f'{pred_root}/{model_name}_{dataset_name}_')
                for keyword in ['openai', 'gpt', 'auxmatch']:
                    for stale_file in glob.glob(f'{stale_prefix}{keyword}*'):
                        if osp.isfile(stale_file):
                            os.remove(stale_file)

            if model is None:
                model = model_name  # which is only a name

            # Perform the Inference
            if dataset_name == 'MMBench-Video':
                model = infer_data_job_video(
                    model,
                    work_dir=pred_root,
                    model_name=model_name,
                    dataset=dataset,
                    nframe=args.nframe,
                    pack=args.pack,
                    verbose=args.verbose,
                    api_nproc=args.nproc)
            else:
                model = infer_data_job(
                    model,
                    work_dir=pred_root,
                    model_name=model_name,
                    dataset=dataset,
                    verbose=args.verbose,
                    api_nproc=args.nproc,
                    ignore_failed=args.ignore)

            # Set the judge kwargs first before evaluation or dumping
            judge_kwargs = {
                'nproc': args.nproc,
                'verbose': args.verbose,
            }
            if args.retry is not None:
                judge_kwargs['retry'] = args.retry
            if args.judge is not None:
                judge_kwargs['model'] = args.judge
            else:
                if dataset.TYPE in ['MCQ', 'Y/N']:
                    judge_kwargs['model'] = 'chatgpt-0125'
                elif listinstr(['MMVet', 'MathVista', 'LLaVABench', 'MMBench-Video'], dataset_name):
                    judge_kwargs['model'] = 'gpt-4-turbo'
            if 'OPENAI_API_KEY_JUDGE' in os.environ and len(os.environ['OPENAI_API_KEY_JUDGE']):
                judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE']
            if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']):
                judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE']

            # Datasets that are not evaluated locally: convert or report, then skip.
            if rank == 0:
                if dataset_name in ['MMMU_TEST']:
                    result_json = MMMU_result_transfer(result_file)
                    logger.info(f'Transfer MMMU_TEST result to json for official evaluation, json file saved in {result_json}')  # noqa: E501
                    continue
                elif 'MMT-Bench_ALL' in dataset_name:
                    submission_file = MMTBench_result_transfer(result_file, **judge_kwargs)
                    logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation (https://eval.ai/web/challenges/challenge-page/2328/overview), submission file saved in {submission_file}')  # noqa: E501
                    continue
                elif 'MLLMGuard_DS' in dataset_name:
                    logger.info('The evaluation of MLLMGuard_DS is not supported yet. ')  # noqa: E501
                    continue
                elif 'AesBench_TEST' == dataset_name:
                    logger.info(f'The results are saved in {result_file}. Please send it to the AesBench Team via [email protected].')  # noqa: E501
                    continue

            # Fixed: a comma was missing after 'MMBench_CN', which fused it with
            # 'MMBench_TEST_CN_V11' into a single bogus entry, so neither split
            # was subject to the official-server check.
            if dataset_name in [
                'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN',
                'MMBench_TEST_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_V11', 'MMBench_CN_V11',
            ]:
                if not MMBenchOfficialServer(dataset_name):
                    logger.error(
                        f'Can not evaluate {dataset_name} on non-official servers, '
                        'will skip the evaluation. '
                    )
                    continue

            if rank == 0 and args.mode == 'all':
                eval_results = dataset.evaluate(result_file, **judge_kwargs)
                if eval_results is not None:
                    assert isinstance(eval_results, (dict, pd.DataFrame))
                    logger.info(f'The evaluation of model {model_name} x dataset {dataset_name} has finished! ')
                    logger.info('Evaluation Results:')
                if isinstance(eval_results, dict):
                    logger.info('\n' + json.dumps(eval_results, indent=4))
                elif isinstance(eval_results, pd.DataFrame):
                    # Transpose wide tables so they print with one metric per row.
                    if len(eval_results) < len(eval_results.columns):
                        eval_results = eval_results.T
                    logger.info('\n' + tabulate(eval_results))
# Script entry point: load the environment configuration first, then run the
# evaluation driver.
if __name__ == '__main__':
    load_env()
    main()
The script is just copied from the git repo.
Can you identify whether there is anything wrong?
I checked the tsv file and it looks normal, with the `answer` field in the header.
@lucasjinreal Would you please try to reproduce this bug with the latest main branch of VLMEvalKit (with any officially supported VLM, which is recommended, or you can simply add the VLM you want to evaluate on top of the latest main branch)? I still cannot figure out what the problem is based on the provided log.
BTW, the log in your first comment seems not the full error log. Providing the full error log might also help.
I am using the latest version. In fact, I tried every version starting from a certain one, and the error keeps appearing.
If any people can reproduce this error, please let me know. I can not reproduce the error with any officially supported command / script, please let me know.
Let me rerun this in a fresh environment.
Traceback (most recent call last):
File "/root/miniconda3/lib/python3.12/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
return self._engine.get_loc(casted_key)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'answer'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/data/mllm/run_vlmevalkit2.py", line 226, in <module>
main()
File "/data/mllm/run_vlmevalkit2.py", line 211, in main
eval_results = dataset.evaluate(result_file, **judge_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.local/lib/python3.12/site-packages/vlmeval/dataset/image_mcq.py", line 181, in evaluate
answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
~~~~^^^^^^^^^^
File "/root/miniconda3/lib/python3.12/site-packages/pandas/core/frame.py", line 4090, in __getitem__
indexer = self.columns.get_loc(key)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/lib/python3.12/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc
raise KeyError(key) from err
KeyError: 'answer'
Got a similar error:
>>> import vlmeval
vlmeval.Did not detect the .env file at /root/.local/lib/python3.12/site-packages/.env, failed to load.
>>> vlmeval.__version__
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: module 'vlmeval' has no attribute '__version__'
>>> vlmeval.__version__
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: module 'vlmeval' has no attribute '__version__'
Traceback (most recent call last): File "/root/miniconda3/lib/python3.12/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc return self._engine.get_loc(casted_key) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item KeyError: 'answer' The above exception was the direct cause of the following exception: Traceback (most recent call last): File "/data/mllm/run_vlmevalkit2.py", line 226, in <module> main() File "/data/mllm/run_vlmevalkit2.py", line 211, in main eval_results = dataset.evaluate(result_file, **judge_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/.local/lib/python3.12/site-packages/vlmeval/dataset/image_mcq.py", line 181, in evaluate answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])} ~~~~^^^^^^^^^^ File "/root/miniconda3/lib/python3.12/site-packages/pandas/core/frame.py", line 4090, in __getitem__ indexer = self.columns.get_loc(key) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/miniconda3/lib/python3.12/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc raise KeyError(key) from err KeyError: 'answer'Got a simillar error:
>>> import vlmeval vlmeval.Did not detect the .env file at /root/.local/lib/python3.12/site-packages/.env, failed to load. >>> vlmeval.__version__ Traceback (most recent call last): File "<stdin>", line 1, in <module> AttributeError: module 'vlmeval' has no attribute '__version__' >>> vlmeval.__version__ Traceback (most recent call last): File "<stdin>", line 1, in <module> AttributeError: module 'vlmeval' has no attribute '__version__'
Please tell me the exact command that caused the error so I can reproduce it.
I am running MME as well. Can you tell what could be going wrong from the log?
These error messages are very confusing to me; they don't tell what could be wrong, but it is definitely related to a certain dataset.
the command is:
torchrun --nproc-per-node=$num_gpus --master-port 2344 run_vlmevalkit2.py --data $eval_datasets --model minigemini_qwen_7b --verbose
The script used is exactly the same as the one in the git repo.
eval_datasets='MMBench_DEV_CN MMStar ChartQA_TEST Q-Bench1_VAL MMMU_VAL POPE HallusionBench RealWorldQA MMBench_TEST_EN MMBench_TEST_CN MMBench_TEST_EN_V11 MMBench_TEST_CN_V11 MME'
all the data produced result, except MME.
I am running MME as well. Can you tell what could be going wrong from the log?
These error messages are very confusing to me; they don't tell what could be wrong, but it is definitely related to a certain dataset.
the command is:
torchrun --nproc-per-node=$num_gpus --master-port 2344 run_vlmevalkit2.py --data $eval_datasets --model minigemini_qwen_7b --verbose
The script used is exactly the same as the one in the git repo.
eval_datasets='MMBench_DEV_CN MMStar ChartQA_TEST Q-Bench1_VAL MMMU_VAL POPE HallusionBench RealWorldQA MMBench_TEST_EN MMBench_TEST_CN MMBench_TEST_EN_V11 MMBench_TEST_CN_V11 MME'
all the data produced result, except MME.
@MonolithFoundation
I cannot debug with run_vlmevalkit2.py and minigemini_qwen_7b
because they are not an officially provided script and model. Would you please tell me whether you can reproduce the error with the official run.py and an officially supported VLM?
Hi, the script is copied from the GitHub repo at https://github.com/open-compass/VLMEvalKit/blob/main/run.py without any modification. Could you please not use such large headings for emphasis? They take up too much space on my screen. This is just a discussion; I am not blaming your code.
Hi, the script is copied from the GitHub repo at https://github.com/open-compass/VLMEvalKit/blob/main/run.py without any modification. Could you please not use such large headings for emphasis? They take up too much space on my screen. This is just a discussion; I am not blaming your code.
I just think I really need to emphasize it to save communications. For example, I also emphasized that I need an officially supported VLM to debug with, please provide me a name.
@kennymckormick Hi, the model used is this:
class LLaVA(BaseModel):
    """Wrapper that loads a LLaVA-style checkpoint and answers image+text prompts.

    The model is loaded through ``llava.model.builder.load_pretrained_model``
    on CPU and then moved to CUDA. Prompts are formatted with one of the
    ``llava.conversation`` templates, selected by ``self.conv_mode``.
    """

    INSTALL_REQ = True

    def __init__(self,
                 model_pth='liuhaotian/llava_v1.5_7b',
                 **kwargs):
        """Load tokenizer, model and image processor for ``model_pth``.

        Args:
            model_pth: a local path that exists, or an identifier for which
                ``splitlen(model_pth) == 2`` (presumably ``org/name`` — verify
                against ``splitlen``).
            **kwargs: overrides merged on top of the default generation config.

        Exits the process with status -1 when ``llava`` cannot be imported or
        the checkpoint fails to load.
        """
        try:
            from llava.model.builder import load_pretrained_model
            from llava.mm_utils import get_model_name_from_path
        except Exception as e:
            import traceback
            # Fixed: this was `trackback.print_exec()`, which raised NameError
            # and hid the real import failure.
            traceback.print_exc()
            warnings.warn(f'Please install llava before using LLaVA {e}')
            sys.exit(-1)

        warnings.warn('Please install the latest version of llava from github before you evaluate the LLaVA model. ')
        assert osp.exists(model_pth) or splitlen(model_pth) == 2

        # The ShareGPT4V checkpoints are loaded under the matching LLaVA-1.5 name.
        if model_pth == 'Lin-Chen/ShareGPT4V-7B':
            model_name = 'llava-v1.5-7b'
        elif model_pth == 'Lin-Chen/ShareGPT4V-13B':
            model_name = 'llava-v1.5-13b'
        else:
            model_name = get_model_name_from_path(model_pth)
        print(f'------> model_name: {model_name}')

        try:
            self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
                model_path=model_pth,
                model_base=None,
                model_name=model_name,
                device='cpu',
                device_map='cpu'
            )
            # Fall back to the EOS token when the tokenizer defines no pad token.
            if self.tokenizer.pad_token_id is None:
                self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        except Exception as e:
            if 'ShareGPT4V' in model_pth:
                import llava
                warnings.warn(
                    'Please manually remove the encoder type check in '
                    f'{llava.__path__[0]}/model/multimodal_encoder/builder.py '
                    'Line 8 to use the ShareGPT4V model. ')
            else:
                warnings.warn(f'Unknown error when loading LLaVA model. {e}')
            # sys.exit rather than the site-provided `exit`, matching the
            # import-failure path above.
            sys.exit(-1)

        self.model = self.model.cuda()
        # self.conv_mode = 'llava_v1'
        self.conv_mode = 'qwen'
        if 'llama-3' in model_pth or 'llama3' in model_pth:
            self.conv_mode = 'llama3'
        elif 'mistral' in model_pth.lower():
            self.conv_mode = 'mistral'

        kwargs_default = dict(do_sample=False, temperature=0, max_new_tokens=512, top_p=None, num_beams=1)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def use_custom_prompt(self, dataset):
        """Return whether ``build_prompt`` should be used for ``dataset``.

        As written, this returns True for every dataset; ``dataset`` must not
        be None.
        """
        assert dataset is not None
        # NOTE(review): both branches return True, so the custom prompt is
        # applied to non-MCQ datasets (e.g. Y/N) as well. The `return False`
        # fallback was commented out; confirm this is intended.
        if DATASET_TYPE(dataset) == 'MCQ':
            return True
        # return False
        return True

    def build_prompt(self, line, dataset=None):
        """Build the text+image message for one dataset row.

        Args:
            line: a row with a ``'question'`` field and, optionally, ``'hint'``
                and option fields named ``'A'``..``'Z'``.
            dataset: dataset name (a string).

        Returns:
            A list of dicts: one ``type='text'`` entry holding the prompt,
            followed by one ``type='image'`` entry per dumped image path.
        """
        assert self.use_custom_prompt(dataset)
        assert dataset is None or isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        # Collect the answer options that are present and not NaN.
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        # Append an answering instruction, in Chinese when cn_string(prompt) is true.
        if len(options):
            prompt += (
                '\n请直接回答选项字母。' if cn_string(prompt) else
                "\nAnswer with the option's letter from the given choices directly."
            )
        else:
            prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message

    def generate_inner(self, message, dataset=None):
        """Generate the model's answer for one text+image message.

        Args:
            message: the message passed to ``self.message_to_promptimg``, which
                yields the prompt text and a single image path.
            dataset: unused here.

        Returns:
            The decoded, stripped text of the first generated sequence.
        """
        from types import SimpleNamespace

        from llava.mm_utils import process_images, tokenizer_image_token, KeywordsStoppingCriteria
        from llava.constants import (
            IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN)
        from llava.conversation import conv_templates, SeparatorStyle

        prompt, image_path = self.message_to_promptimg(message)
        # Close the file handle once the RGB copy has been made.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # Plain attribute bag standing in for a model config; replaces the
        # deprecated `abstractproperty()` instance that was used for this.
        # Presumably process_images only reads `image_aspect_ratio` from it —
        # verify against the installed llava version.
        args = SimpleNamespace(image_aspect_ratio='pad')
        image_tensor = process_images([image], self.image_processor, args).to('cuda', dtype=torch.float16)

        if self.model.config.mm_use_im_start_end:
            inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt
        else:
            inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt

        conv = conv_templates[self.conv_mode].copy()
        conv.append_message(conv.roles[0], inp)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(
            prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids, image_tensor, stopping_criteria=[stopping_criteria], **self.kwargs)
        output = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
        return output
I opine that it is scarcely similar to Llava. I merely employed a name available on my end.
Could you have a look to determine if there is anything incorrect?
Please refrain from using head lines; you can employ bold for emphasis.
Personally I only help debug with officially supported features (models, benchmarks, etc. ). But I will keep the issue open to see if anyone can reproduce the bug with officially supported scripts and models, or there might be some community users that are willing to help.
@kennymckormick Did you see any wrong configuration in the model file I posted (the LLaVA-like one)?