ipex-llm
Failed to run inference on the latest version (67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984) of mpt-7b with BigDL
With the latest commit of mpt-7b, https://huggingface.co/mosaicml/mpt-7b/commit/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984, BigDL throws the error below when generating text.
INFO 2024-02-20 06:41:05,962 proxy 172.17.0.2 0dfd2310-daba-4d40-8b27-6ccbbd608fd2 /mpt-7b-bigdl router.py:959 - Using router <class 'ray.serve._private.router.PowerOfTwoChoicesReplicaScheduler'>.
INFO 2024-02-20 06:41:05,978 proxy 172.17.0.2 router.py:496 - Got updated replicas for deployment 'PredictorDeployment' in application 'mpt-7b-bigdl': {'mpt-7b-bigdl#PredictorDeployment#jBsKpA'}.
ERROR 2024-02-20 06:41:06,031 proxy 172.17.0.2 0dfd2310-daba-4d40-8b27-6ccbbd608fd2 /mpt-7b-bigdl proxy.py:1045 - ray::ServeReplica:mpt-7b-bigdl:PredictorDeployment.handle_request_streaming() (pid=8543, ip=172.17.0.2, actor_id=55a8f57651e95e4bfc9348b101000000, repr=<ray.serve._private.replica.ServeReplica:mpt-7b-bigdl:PredictorDeployment object at 0x7fa349fc2d30>)
    async for result in generator:
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/replica.py", line 338, in _handle_http_request_generator
    raise e from None
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/replica.py", line 914, in call_user_method
    raise e from None
ray.exceptions.RayTaskError: ray::ServeReplica:mpt-7b-bigdl:PredictorDeployment.handle_request_streaming() (pid=8543, ip=172.17.0.2)
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/utils.py", line 165, in wrap_to_ray_error
    raise exception
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/replica.py", line 895, in call_user_method
    result = await method_to_call(*request_args, **request_kwargs)
  File "/root/llm-on-ray/inference/predictor_deployment.py", line 121, in __call__
    return self.predictor.generate(prompts, **config)
  File "/root/llm-on-ray/inference/transformer_predictor.py", line 122, in generate
    gen_tokens = self.model.generate(
  File "/opt/conda/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py", line 1538, in generate
    return self.greedy_search(
  File "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py", line 2362, in greedy_search
    outputs = self(
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984/modeling_mpt.py", line 436, in forward
    outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, inputs_embeds=inputs_embeds)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984/modeling_mpt.py", line 357, in forward
    (x, attn_weights, present) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions), alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984/blocks.py", line 40, in forward
    (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
TypeError: mpt_multihead_attention_forward() got an unexpected keyword argument 'rotary_emb_w_meta_info'
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/proxy.py", line 979, in send_request_to_replica
    async for asgi_message_batch in response_generator:
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/proxy_response_generator.py", line 111, in __anext__
    raise e from None
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/proxy_response_generator.py", line 91, in __anext__
    result = await self._get_next_streaming_result()
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/proxy_response_generator.py", line 134, in _get_next_streaming_result
    return next_result_task.result()
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/proxy_response_generator.py", line 116, in _await_response_anext
    return await self._response.__anext__()
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/handle.py", line 781, in __anext__
    return await next_obj_ref
ray.exceptions.RayTaskError(TypeError): ray::ServeReplica:mpt-7b-bigdl:PredictorDeployment.handle_request_streaming() (pid=8543, ip=172.17.0.2, actor_id=55a8f57651e95e4bfc9348b101000000, repr=<ray.serve._private.replica.ServeReplica:mpt-7b-bigdl:PredictorDeployment object at 0x7fa349fc2d30>)
    async for result in generator:
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/replica.py", line 338, in _handle_http_request_generator
    raise e from None
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/replica.py", line 914, in call_user_method
    raise e from None
ray.exceptions.RayTaskError: ray::ServeReplica:mpt-7b-bigdl:PredictorDeployment.handle_request_streaming() (pid=8543, ip=172.17.0.2)
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/utils.py", line 165, in wrap_to_ray_error
    raise exception
  File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/replica.py", line 895, in call_user_method
    result = await method_to_call(*request_args, **request_kwargs)
  File "/root/llm-on-ray/inference/predictor_deployment.py", line 121, in __call__
    return self.predictor.generate(prompts, **config)
  File "/root/llm-on-ray/inference/transformer_predictor.py", line 122, in generate
    gen_tokens = self.model.generate(
  File "/opt/conda/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py", line 1538, in generate
    return self.greedy_search(
  File "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py", line 2362, in greedy_search
    outputs = self(
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984/modeling_mpt.py", line 436, in forward
    outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, inputs_embeds=inputs_embeds)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984/modeling_mpt.py", line 357, in forward
    (x, attn_weights, present) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions), alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984/blocks.py", line 40, in forward
    (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
TypeError: mpt_multihead_attention_forward() got an unexpected keyword argument 'rotary_emb_w_meta_info'
Hi @jiafuzha, the error indicates that an unexpected argument is being passed to the BigDL-wrapped forward of MPT attention. This happens because BigDL currently only supports mosaicml/mpt-7b-chat and mosaicml/mpt-30b-chat, which do not apply rotary embedding; mpt-7b is a different model that does apply rotary embedding.
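To illustrate the mechanism (a purely hypothetical sketch, not BigDL's actual code): BigDL swaps MPT's attention forward for an optimized function whose signature matches the older modeling code, so any new keyword argument the updated model code passes triggers exactly this TypeError:

def patched_attention_forward(x, past_key_value=None, attn_bias=None,
                              attention_mask=None, is_causal=True):
    # Stand-in for the BigDL-optimized attention path
    return x

try:
    # The latest mpt-7b modeling code forwards extra keyword arguments such as
    # rotary_emb_w_meta_info, which the old signature does not accept.
    patched_attention_forward("hidden_states", rotary_emb_w_meta_info={"impl": "hf"})
except TypeError as err:
    print(err)  # ... got an unexpected keyword argument 'rotary_emb_w_meta_info'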
I suggest launching a standalone test, following the document here, before Ray deployment.
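For example, a minimal standalone test (a sketch using BigDL-LLM's transformers-style API; the prompt and generation length are arbitrary) could look like this:

from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM

model_id = "mosaicml/mpt-7b-chat"  # one of the currently supported MPT variants
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# load_in_4bit=True applies BigDL-LLM's low-bit optimizations at load time
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             load_in_4bit=True,
                                             trust_remote_code=True)

inputs = tokenizer("What is AI?", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))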
If you really want to enable the mpt-7b model, try inserting the lines below into the mpt_multihead_attention_forward function in the bigdl/llm/transformers/models/mpt.py file of your environment, and append a rotary_emb_w_meta_info argument to that function's signature:
if rotary_emb_w_meta_info is not None:
    # Unpack the rotary-embedding metadata passed down by the newer MPT modeling code
    rotary_emb = rotary_emb_w_meta_info['rotary_emb']
    seq_len = rotary_emb_w_meta_info['seq_len']
    offset_info = rotary_emb_w_meta_info['offset_info']
    bsz, seqlen = query.shape[:2]
    query = query.view(bsz, seqlen, -1, self.head_dim)
    key = key.view(bsz, seqlen, -1, self.head_dim)
    if rotary_emb_w_meta_info['impl'] == 'dail':
        # Dao-AILab flash-attn rotary implementation: expects stacked key/value
        value = value.view(bsz, seqlen, -1, self.head_dim)
        kv = torch.stack([key, value], dim=2)
        query, kv = rotary_emb(query,
                               kv,
                               seqlen_offset=offset_info,
                               max_seqlen=seq_len)
        [key, value] = torch.unbind(kv, dim=2)
        value = value.view(bsz, seqlen, self.kv_n_heads * self.head_dim)
    elif rotary_emb_w_meta_info['impl'] == 'hf':
        # Hugging Face rotary implementation; apply_rotary_pos_emb and
        # is_transformers_version_gte come from llm-foundry / transformers'
        # llama modeling code, so import or define them accordingly
        (cos, sin) = rotary_emb(value, seq_len)
        if is_transformers_version_gte('4.36'):
            query, key = apply_rotary_pos_emb(query,
                                              key,
                                              cos,
                                              sin,
                                              offset_info,
                                              unsqueeze_dim=2)
        else:
            query = query.transpose(1, 2)
            key = key.transpose(1, 2)
            query, key = apply_rotary_pos_emb(query, key, cos, sin,
                                              offset_info)
            query = query.transpose(1, 2)
            key = key.transpose(1, 2)
    # Reshape query/key back to the 3-D layout expected downstream (as in llm-foundry)
    query = query.view(bsz, seqlen, self.d_model)
    key = key.view(bsz, seqlen, self.kv_n_heads * self.head_dim)
This is copied from the mosaicml/llm-foundry repo.
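To locate the file to edit in your environment, and to confirm afterwards that the wrapped forward accepts the new argument, a quick check like this can help (a sketch; it only relies on the module and function names already mentioned above):

import inspect
import bigdl.llm.transformers.models.mpt as bigdl_mpt

# Path of the mpt.py file to edit in the active environment
print(bigdl_mpt.__file__)

# After editing, the patched forward should expose the new keyword argument
sig = inspect.signature(bigdl_mpt.mpt_multihead_attention_forward)
print('rotary_emb_w_meta_info' in sig.parameters)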
Thanks for the detailed info. Let me change the model.
Shall we add this support to BigDL?
As verified, rotary_emb_w_meta_info has also been added to mpt-7b-chat.
File "/root/llm-on-ray/inference/predictor_deployment.py", line 121, in call return self.predictor.generate(prompts, **config) File "/root/llm-on-ray/inference/transformer_predictor.py", line 122, in generate gen_tokens = self.model.generate( File "/opt/conda/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(*args, **kwargs) File "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py", line 1538, in generate return self.greedy_search( File "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py", line 2362, in greedy_search outputs = self( File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl return forward_call(*args, **kwargs) File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b-chat/1a1d410c70591fcc1a46486a254cd0e600e7b1b4/modeling_mpt.py", line 436, in forward outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, inputs_embeds=inputs_embeds) File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl return forward_call(*args, **kwargs) File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b-chat/1a1d410c70591fcc1a46486a254cd0e600e7b1b4/modeling_mpt.py", line 357, in forward (x, attn_weights, present) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions), alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info) File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl return forward_call(*args, **kwargs) File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b-chat/1a1d410c70591fcc1a46486a254cd0e600e7b1b4/blocks.py", line 40, in forward (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info) File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl return forward_call(*args, **kwargs) TypeError: mpt_multihead_attention_forward() got an unexpected keyword argument 'rotary_emb_w_meta_info'
That would be great, considering mpt-7b-chat has the same issue.
Hi @jiafuzha, please wait a bit; I am working on supporting this feature.
Take your time. Thanks.
Hi @jiafuzha, rotary embedding has been enabled for MPT in #10208; you can upgrade bigdl-llm in your environment with pip install --pre --upgrade bigdl-llm[all].
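After upgrading, the installed version can be double-checked with a generic package-metadata query (not a BigDL-specific command):

from importlib.metadata import version

# Prints the bigdl-llm version currently installed in the active environment
print(version("bigdl-llm"))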
Let me try. Thanks.