past_length is None
Traceback (most recent call last):
File "/root/miniconda3/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 1271, in call_hook
getattr(hook, fn_name)(self, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/xtuner/engine/hooks/evaluate_chat_hook.py", line 230, in before_train
self._generate_samples(runner, max_new_tokens=50)
File "/root/miniconda3/lib/python3.10/site-packages/xtuner/engine/hooks/evaluate_chat_hook.py", line 216, in _generate_samples
self._eval_images(runner, model, device, max_new_tokens,
File "/root/miniconda3/lib/python3.10/site-packages/xtuner/engine/hooks/evaluate_chat_hook.py", line 148, in _eval_images
generation_output = model.generate(
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py", line 1758, in generate
result = self._sample(
File "/root/miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py", line 2390, in _sample
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py", line 1321, in _get_initial_cache_position
past_length = model_kwargs["past_key_values"][0][0].shape[2]
TypeError: 'NoneType' object is not subscriptable
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/root/miniconda3/lib/python3.10/site-packages/xtuner/tools/train.py", line 360, in <module>
main()
File "/root/miniconda3/lib/python3.10/site-packages/xtuner/tools/train.py", line 356, in main
runner.train()
File "/root/miniconda3/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 1200, in train
model = self.train_loop.run() # type: ignore
File "/root/miniconda3/lib/python3.10/site-packages/mmengine/runner/loops.py", line 271, in run
self.runner.call_hook('before_train')
File "/root/miniconda3/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 1273, in call_hook
raise TypeError(f'{e} in {hook}') from e
TypeError: 'NoneType' object is not subscriptable in <xtuner.engine.hooks.evaluate_chat_hook.EvaluateChatHook object at 0x7f5bccfe50c0>
[2024-05-27 19:17:30,464] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 2618) of binary: /root/miniconda3/bin/python
Traceback (most recent call last):
File "/root/miniconda3/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/root/miniconda3/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/root/miniconda3/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/root/miniconda3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/miniconda3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/root/miniconda3/lib/python3.10/site-packages/xtuner/tools/train.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
Any help would be greatly appreciated!
@vincent507cpu Can you provide more information? For example, configuration files, whether any changes were made, etc.
@hhaAndroid Thank you for your help. I only changed Part 1:
#######################################################################
# PART 1 Settings #
#######################################################################
# Model
llm_name_or_path = '/root/autodl-tmp/Models/Meta-Llama-3-8B-Instruct'
visual_encoder_name_or_path = '/root/autodl-tmp/Models/clip-vit-large-patch14-336'
# Data
data_root = '/root/autodl-tmp/'
data_path = data_root + 'Data/model_finetuning/llava_finetune_pretrain.json'
image_folder = data_root + 'Data/MultiModalQA/final_dataset_images'
prompt_template = PROMPT_TEMPLATE.llama3_chat
max_length = int(2048 - (336 / 14)**2)
# Scheduler & Optimizer
batch_size = 1 # per_device
accumulative_counts = 256
dataloader_num_workers = 0
max_epochs = 3
optim_type = AdamW
lr = 1e-3
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1 # grad clip
warmup_ratio = 0.03
# Save
save_steps = 50000
save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
# Evaluate the generation performance during the training
evaluation_freq = 50000
SYSTEM = ''
evaluation_images = image_folder + '/6ce6de9d60f9b58cd4f925db4642f96b.jpg'
evaluation_inputs = ['Please describe this picture']
I'm using the MultimodalQA dataset; the text (image captions) was generated by GPT-4o.
Hi @hhaAndroid, fine-tuning fails with a similar error.
- LLM: meta-llama/Meta-Llama-3-8B-Instruct (downloaded to local folder)
- image encoder: openai/clip-vit-large-patch14-336 (downloaded to local folder)
- projector: xtuner/llava-llama-3-8b-v1_1-pretrain (downloaded to local folder)
- command:
NPROC_PER_NODE=1 xtuner train /root/autodl-tmp/GitHub/xtuner/xtuner/configs/llava/llama3_8b_instruct_clip_vit_large_p14_336/finetune/llava_llama3_8b_instruct_qlora_clip_vit_large_p14_336_e1_gpu1_finetune.py --deepspeed deepspeed_zero3
- config change:
#######################################################################
# PART 1 Settings #
#######################################################################
# Model
llm_name_or_path = '/root/autodl-tmp/Models/Meta-Llama-3-8B-Instruct'
visual_encoder_name_or_path = '/root/autodl-tmp/Models/clip-vit-large-patch14-336'
# Specify the pretrained pth
pretrained_pth = '/root/autodl-tmp/Models/llava-llama-3-8b-v1_1-pretrain/iter_9742.pth' # noqa: E501
# Data
data_root = '/root/autodl-tmp/'
data_path = data_root + 'Data/model_finetuning/llava_finetune_pretrain.json'
image_folder = data_root + 'Data/MultiModalQA/final_dataset_images'
prompt_template = PROMPT_TEMPLATE.llama3_chat
max_length = int(2048 - (336 / 14)**2)
# Scheduler & Optimizer
batch_size = 1 # per_device
accumulative_counts = 128
dataloader_num_workers = 0
max_epochs = 3
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1 # grad clip
warmup_ratio = 0.03
# Save
save_steps = 50000
save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
# Evaluate the generation performance during the training
evaluation_freq = 50000
SYSTEM = ''
# evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg'
evaluation_images = image_folder + '/6ce6de9d60f9b58cd4f925db4642f96b.jpg'
evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture']
- error message:
Traceback (most recent call last):
File "/root/miniconda3/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 1271, in call_hook
getattr(hook, fn_name)(self, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/xtuner/engine/hooks/evaluate_chat_hook.py", line 230, in before_train
self._generate_samples(runner, max_new_tokens=50)
File "/root/miniconda3/lib/python3.10/site-packages/xtuner/engine/hooks/evaluate_chat_hook.py", line 216, in _generate_samples
self._eval_images(runner, model, device, max_new_tokens,
File "/root/miniconda3/lib/python3.10/site-packages/xtuner/engine/hooks/evaluate_chat_hook.py", line 148, in _eval_images
generation_output = model.generate(
File "/root/miniconda3/lib/python3.10/site-packages/peft/peft_model.py", line 1491, in generate
outputs = self.base_model.generate(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py", line 1758, in generate
result = self._sample(
File "/root/miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py", line 2390, in _sample
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py", line 1321, in _get_initial_cache_position
past_length = model_kwargs["past_key_values"][0][0].shape[2]
TypeError: 'NoneType' object is not subscriptable
If you have a chance, please look into it. Thank you very much!
可以看看这个,https://github.com/InternLM/xtuner/issues/834 transformers版本的问题,我安装4.39.1,问题就解决了 @vincent507cpu
@KimWu1994 非常感谢!