Run InternLM2, reports error: TypeError: internlm2_attention_forward() got an unexpected keyword argument 'cache_position'

Open johnysh opened this issue 1 year ago • 5 comments

When running InternLM2 inference, it reported the error below:

oneAPI: 2024.0.1.46, ipex-llm: 2.1.0b2, transformers: 4.37.2 / 4.38.2
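For context, the inference code is roughly as follows. This is a minimal sketch reconstructed from the notebook cell in the traceback below and the from_pretrained() call quoted later in the thread; the model path and prompt are placeholders, and the ipex-llm AutoModelForCausalLM import is an assumption based on the usual ipex-llm examples.

from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModelForCausalLM  # assumed: ipex-llm's low-bit wrapper

model_path = "internlm/internlm2-chat-7b"   # placeholder
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             use_cache=True).to('xpu')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

prompt = "What is AI?"                      # placeholder prompt
input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
# ipex_llm model needs a warmup, then inference time can be accurate
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))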



TypeError                                 Traceback (most recent call last)
Cell In[1], line 33
     31 input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
     32 # ipex_llm model needs a warmup, then inference time can be accurate
---> 33 output = model.generate(input_ids,max_new_tokens=32)
     34 # if your selected model is capable of utilizing previous key/value attentions
     35 # to enhance decoding speed, but has "use_cache": false in its model config,
     36 # it is important to set use_cache=True explicitly in the generate function
     37 # to obtain optimal performance with IPEX-LLM INT4 optimizations
     38 output = model.generate(input_ids,max_new_tokens=32)

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/ipex_llm/transformers/lookup.py:88, in generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, **kwargs)
     77     logger.warning("Since you call the generate with lookahead parameter, "
     78                    f"Speculative decoding parameters {spec_params} are "
     79                    "removed in the generation.")
     80     return self.lookup_generate(inputs=inputs,
     81                                 num_output_tokens=lookahead,
     82                                 generation_config=generation_config,
    (...)
     85                                 prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
     86                                 **kwargs)
---> 88 return original_generate(self,
     89                          inputs=inputs,
     90                          generation_config=generation_config,
     91                          logits_processor=logits_processor,
     92                          stopping_criteria=stopping_criteria,
     93                          prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
     94                          synced_gpus=synced_gpus,
     95                          assistant_model=assistant_model,
     96                          streamer=streamer,
     97                          **kwargs)

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/ipex_llm/transformers/speculative.py:109, in generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, **kwargs)
    105 for var in ['max_step_draft', 'th_stop_draft', 'hf_adjust',
    106             'auto_th_stop_draft', 'auto_parameters', 'min_step_draft',
    107             'th_batch_num']:
    108     kwargs.pop(var, None)
--> 109 return original_generate(self,
    110                          inputs=inputs,
    111                          generation_config=generation_config,
    112                          logits_processor=logits_processor,
    113                          stopping_criteria=stopping_criteria,
    114                          prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
    115                          synced_gpus=synced_gpus,
    116                          assistant_model=assistant_model,
    117                          streamer=streamer,
    118                          **kwargs)

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/ipex_llm/transformers/pipeline_parallel.py:280, in generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, **kwargs)
    273     max_new_tokens = kwargs.pop("max_new_tokens", None)
    275     return self.pipeline_parallel_generate(inputs=inputs,
    276                                            max_new_tokens=max_new_tokens,
    277                                            generation_config=generation_config,
    278                                            **kwargs)
--> 280 return original_generate(self,
    281                          inputs=inputs,
    282                          generation_config=generation_config,
    283                          logits_processor=logits_processor,
    284                          stopping_criteria=stopping_criteria,
    285                          prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
    286                          synced_gpus=synced_gpus,
    287                          assistant_model=assistant_model,
    288                          streamer=streamer,
    289                          **kwargs)

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/transformers/generation/utils.py:1544, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
   1526     return self.assisted_decoding(
   1527         input_ids,
   1528         candidate_generator=candidate_generator,
   (...)
   1540         **model_kwargs,
   1541     )
   1542 if generation_mode == GenerationMode.GREEDY_SEARCH:
   1543     # 11. run greedy search
-> 1544     return self.greedy_search(
   1545         input_ids,
   1546         logits_processor=prepared_logits_processor,
   1547         stopping_criteria=prepared_stopping_criteria,
   1548         pad_token_id=generation_config.pad_token_id,
   1549         eos_token_id=generation_config.eos_token_id,
   1550         output_scores=generation_config.output_scores,
   1551         output_logits=generation_config.output_logits,
   1552         return_dict_in_generate=generation_config.return_dict_in_generate,
   1553         synced_gpus=synced_gpus,
   1554         streamer=streamer,
   1555         **model_kwargs,
   1556     )
   1558 elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH:
   1559     if not model_kwargs["use_cache"]:

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/transformers/generation/utils.py:2404, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, output_logits, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
   2401 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
   2403 # forward pass to get next token
-> 2404 outputs = self(
   2405     **model_inputs,
   2406     return_dict=True,
   2407     output_attentions=output_attentions,
   2408     output_hidden_states=output_hidden_states,
   2409 )
   2411 if synced_gpus and this_peer_finished:
   2412     continue  # don't waste resources running the code we don't need

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/.cache/huggingface/modules/transformers_modules/internlm2-chat-7b/modeling_internlm2.py:1204, in InternLM2ForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
   1201 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
   1203 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1204 outputs = self.model(
   1205     input_ids=input_ids,
   1206     attention_mask=attention_mask,
   1207     position_ids=position_ids,
   1208     past_key_values=past_key_values,
   1209     inputs_embeds=inputs_embeds,
   1210     use_cache=use_cache,
   1211     output_attentions=output_attentions,
   1212     output_hidden_states=output_hidden_states,
   1213     return_dict=return_dict,
   1214     cache_position=cache_position,
   1215 )
   1217 hidden_states = outputs[0]
   1218 if self.config.pretraining_tp > 1:

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/.cache/huggingface/modules/transformers_modules/internlm2-chat-7b/modeling_internlm2.py:1004, in InternLM2Model.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
    993     layer_outputs = self._gradient_checkpointing_func(
    994         decoder_layer.__call__,
    995         hidden_states,
   (...)
   1001         cache_position,
   1002     )
   1003 else:
-> 1004     layer_outputs = decoder_layer(
   1005         hidden_states,
   1006         attention_mask=causal_mask,
   1007         position_ids=position_ids,
   1008         past_key_value=past_key_values,
   1009         output_attentions=output_attentions,
   1010         use_cache=use_cache,
   1011         cache_position=cache_position,
   1012     )
   1014 hidden_states = layer_outputs[0]
   1016 if use_cache:

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/.miniconda_dev_zone/envs/notebook-zone/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/.cache/huggingface/modules/transformers_modules/internlm2-chat-7b/modeling_internlm2.py:738, in InternLM2DecoderLayer.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position)
    735 hidden_states = self.attention_norm(hidden_states)
    737 # Self Attention
--> 738 hidden_states, self_attn_weights, present_key_value = self.attention(
    739     hidden_states=hidden_states,
    740     attention_mask=attention_mask,
    741     position_ids=position_ids,
    742     past_key_value=past_key_value,
    743     output_attentions=output_attentions,
    744     use_cache=use_cache,
    745     cache_position=cache_position,
    746 )
    747 hidden_states = residual + hidden_states
    749 # Fully Connected

TypeError: internlm2_attention_forward() got an unexpected keyword argument 'cache_position'

====================================================================================

pip list

Package Version


accelerate  0.23.0
addict  2.4.0
aiofiles  23.2.1
aiohttp  3.9.5
aiosignal  1.3.1
aliyun-python-sdk-core  2.15.1
aliyun-python-sdk-kms  2.16.3
altair  5.3.0
annotated-types  0.7.0
antlr4-python3-runtime  4.9.3
anyio  4.4.0
asttokens  2.4.1
async-timeout  4.0.3
attrs  24.1.0
bigdl-core-xe-21  2.1.0b2
bigdl-core-xe-addons-21  2.1.0b2
bigdl-core-xe-batch-21  2.1.0b2
bitsandbytes  0.43.1
certifi  2024.7.4
cffi  1.16.0
charset-normalizer  3.3.2
click  8.1.7
comm  0.2.2
contourpy  1.2.1
crcmod  1.7
cryptography  42.0.8
cycler  0.12.1
datasets  2.20.0
debugpy  1.8.1
decorator  5.1.1
dill  0.3.8
dnspython  2.6.1
docstring_parser  0.16
einops  0.8.0
email_validator  2.1.2
eval_type_backport  0.2.0
exceptiongroup  1.2.1
executing  2.0.1
fastapi  0.111.0
fastapi-cli  0.0.4
ffmpy  0.3.2
filelock  3.15.4
fonttools  4.53.0
frozenlist  1.4.1
fsspec  2024.6.1
gast  0.5.4
gguf  0.6.0
gradio  4.36.1
gradio_client  1.0.1
h11  0.14.0
httpcore  1.0.5
httptools  0.6.1
httpx  0.27.0
huggingface-hub  0.24.5
idna  3.7
importlib_metadata  7.1.0
importlib_resources  6.4.0
intel-cmplr-lib-ur  2024.2.1
intel-extension-for-pytorch  2.1.10+xpu
intel-openmp  2024.2.1
ipex-llm  2.1.0b2
ipykernel  6.29.4
ipython  8.18.1
ipywidgets  8.1.3
jedi  0.19.1
Jinja2  3.1.4
jmespath  0.10.0
joblib  1.4.2
jsonschema  4.22.0
jsonschema-specifications  2023.12.1
jupyter_client  8.6.2
jupyter_core  5.7.2
jupyterlab_widgets  3.0.11
kiwisolver  1.4.5
kornia  0.7.3
kornia_rs  0.1.5
latex2mathml  3.77.0
Markdown  3.6
markdown-it-py  3.0.0
MarkupSafe  2.1.5
matplotlib  3.9.0
matplotlib-inline  0.1.7
mdtex2html  1.3.0
mdurl  0.1.2
modelscope  1.11.0
mpmath  1.3.0
multidict  6.0.5
multiprocess  0.70.16
nest-asyncio  1.6.0
networkx  3.2.1
nltk  3.8.1
numpy  1.26.4
omegaconf  2.3.0
orjson  3.10.5
oss2  2.18.6
packaging  24.1
pandas  2.2.2
parso  0.8.4
peft  0.12.0
pexpect  4.9.0
pillow  10.3.0
pip  24.2
platformdirs  4.2.2
prompt_toolkit  3.0.47
protobuf  4.25.3
psutil  6.0.0
ptyprocess  0.7.0
pure-eval  0.2.2
py-cpuinfo  9.0.0
pyarrow  16.1.0
pyarrow-hotfix  0.6
pycparser  2.22
pycryptodome  3.20.0
pydantic  2.7.4
pydantic_core  2.18.4
pydub  0.25.1
Pygments  2.18.0
pyparsing  3.1.2
python-dateutil  2.9.0.post0
python-dotenv  1.0.1
python-multipart  0.0.9
pytz  2024.1
PyYAML  6.0.1
pyzmq  26.0.3
referencing  0.35.1
regex  2024.7.24
requests  2.32.3
rich  13.7.1
rpds-py  0.18.1
ruff  0.4.9
safetensors  0.4.4
scikit-learn  1.5.0
scipy  1.13.1
semantic-version  2.10.0
sentence-transformers  2.3.1
sentencepiece  0.2.0
setuptools  69.5.1
shellingham  1.5.4
shtab  1.7.1
simplejson  3.19.2
six  1.16.0
sniffio  1.3.1
sortedcontainers  2.4.0
spandrel  0.3.4
sse-starlette  2.1.3
stack-data  0.6.3
starlette  0.37.2
sympy  1.12.1
tabulate  0.9.0
threadpoolctl  3.5.0
tiktoken  0.7.0
timm  1.0.7
tokenizers  0.15.2
tomli  2.0.1
tomlkit  0.12.0
toolz  0.12.1
torch  2.1.0a0+cxx11.abi
torchsde  0.2.6
torchvision  0.16.0a0+cxx11.abi
tornado  6.4.1
tqdm  4.66.5
traitlets  5.14.3
trampoline  0.1.2
transformers  4.38.2
transformers-stream-generator  0.0.5
triton  3.0.0
trl  0.9.6
typer  0.12.3
typing_extensions  4.12.2
tyro  0.8.5
tzdata  2024.1
ujson  5.10.0
urllib3  2.2.2
uvicorn  0.30.1
uvloop  0.19.0
viola  0.3.8
watchfiles  0.22.0
wcwidth  0.2.13
websockets  11.0.3
wheel  0.44.0
widgetsnbextension  4.0.11
xxhash  3.4.1
yapf  0.40.2
yarl  1.9.4
zipp  3.19.2

johnysh · Aug 08 '24 08:08

We have reproduced this issue and are looking for a solution. We will keep you updated once it is resolved.

hkvision · Aug 09 '24 06:08

from ipex_llm.transformers import AutoModelForCausalLM  # assumed import, per the usual ipex-llm examples

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,   # INT4 optimization
                                             trust_remote_code=True,
                                             use_cache=True)

I set "load_in_4bit=False" and got the model running, but the speed became slow. I hope this helps point you in a direction.
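For clarity, the temporary workaround is just flipping that one flag; a sketch of the change, with everything else as in the snippet above:

# Temporary workaround: load without the INT4 optimization.
# It avoids the error but, as noted above, inference becomes much slower.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=False,
                                             trust_remote_code=True,
                                             use_cache=True)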

johnysh · Aug 09 '24 06:08

Hi, after checking with our team: for internlm2 we only support v1.1.0 of this model: https://huggingface.co/internlm/internlm2-chat-7b/tree/v1.1.0. You can download this model version by following the guide here: https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/HuggingFace/LLM/internlm2#4-running-examples
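If it helps, one way to pin that revision when downloading is via huggingface_hub; this is just a sketch, and the linked guide may use a different download method:

# Sketch: download the v1.1.0 snapshot of internlm2-chat-7b explicitly.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(repo_id="internlm/internlm2-chat-7b",
                              revision="v1.1.0")
print(local_dir)  # pass this directory to from_pretrained()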

Since internlm has already released newer models, internlm2.5 will be our focus for support going forward. Feel free to tell us if you wish to use internlm2.5 as well :)

hkvision · Aug 12 '24 02:08

I followed this BKC and added revision="v1.1.0", and now it runs well. Thanks!
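Concretely, the working call ends up looking like this; a sketch based on the snippet earlier in the thread, with model_path as the same placeholder:

# Pin the v1.1.0 revision so the older modeling_internlm2.py (which, per this
# thread, does not pass cache_position) is used with ipex-llm's INT4 optimization.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             revision="v1.1.0",
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             use_cache=True)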

johnysh · Aug 12 '24 08:08

Glad to hear that it works!

hkvision · Aug 12 '24 08:08