[Bug]: `is_causal` in the MiniCPMAttention class has no effect
Is there an existing issue?
- [X] I have searched, and there is no existing issue.
Describe the bug
In https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16/blob/main/modeling_minicpm.py there is:
```python
class MiniCPMAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: MiniCPMConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
        self._init_rope()

    def _init_rope(self):
        if self.config.rope_scaling is None:
            self.rotary_emb = MiniCPMRotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = MiniCPMLinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = MiniCPMDynamicNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead."
            )

        bsz, q_len, _ = hidden_states.size()

        if self.config.pretraining_tp > 1:
            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
            query_slices = self.q_proj.weight.split(
                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
            )
            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
            query_states = torch.cat(query_states, dim=-1)

            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
            key_states = torch.cat(key_states, dim=-1)

            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
            value_states = torch.cat(value_states, dim=-1)

        else:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states.to(torch.float32), seq_len=kv_seq_len)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        if self.config.pretraining_tp > 1:
            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
        else:
            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
```
As the code shows, `is_causal` is defined in `__init__`, but the eager `forward` never reads it; the flag is only taken into account when flash_attention_2 is enabled. This is a bug: without flash attention, anyone who wants bidirectional attention runs into a problem, because the eager path simply applies whatever causal mask the model passes in, regardless of `is_causal`.
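For illustration only, here is a minimal sketch (my own, not code from modeling_minicpm.py) of how an eager attention path could honor `is_causal`: the causal bias is built locally and gated on the flag, instead of being baked into the `attention_mask` that the model passes down. The function name and the assumption that `attention_mask` carries only padding information are hypothetical.

```python
# Sketch: an eager attention step that applies the causal mask only when is_causal is True.
import math
import torch


def eager_attention(query_states, key_states, value_states, attention_mask=None,
                    is_causal=True, dropout_p=0.0, training=False):
    # query/key/value: (bsz, num_heads, q_len, head_dim)
    # attention_mask (assumption): additive padding mask only, shape (bsz, 1, q_len, kv_seq_len)
    bsz, num_heads, q_len, head_dim = query_states.shape
    kv_seq_len = key_states.shape[-2]

    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(head_dim)

    if is_causal:
        # Build the causal bias here, offset so cached keys (kv_seq_len > q_len) stay visible.
        causal_mask = torch.full(
            (q_len, kv_seq_len), float("-inf"),
            dtype=attn_weights.dtype, device=attn_weights.device,
        )
        causal_mask = torch.triu(causal_mask, diagonal=kv_seq_len - q_len + 1)
        attn_weights = attn_weights + causal_mask

    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
    attn_weights = torch.nn.functional.dropout(attn_weights, p=dropout_p, training=training)
    return torch.matmul(attn_weights, value_states)
```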
To Reproduce
Use eager attention and set `is_causal = False` on the attention modules to request bidirectional attention.
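A possible reproduction sketch; the module layout (`model.model.layers[i].self_attn`) is assumed from the Llama-style structure of the file, and inspecting the weights via `output_attentions=True` is my own choice, so treat both as assumptions.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "openbmb/MiniCPM-2B-sft-bf16"
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path, trust_remote_code=True, attn_implementation="eager", torch_dtype=torch.bfloat16
)

# Ask every attention layer for bidirectional attention (assumed module layout).
for layer in model.model.layers:
    layer.self_attn.is_causal = False

inputs = tokenizer("Hello world, this is a test.", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs, output_attentions=True)

# If is_causal were honored, weights on future tokens would be non-zero.
# In practice the upper triangle stays (numerically) zero, i.e. attention is still causal.
attn = out.attentions[0][0, 0]  # layer 0, batch 0, head 0: (seq_len, seq_len)
print(torch.triu(attn, diagonal=1).abs().max())
```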
Expected behavior
Bidirectional attention output; in practice, the result is still causal attention.
Screenshots
No response
Environment
- OS: [e.g. Ubuntu 18.04]
- Pytorch: [e.g. torch 2.0.0]
- CUDA: [e.g. CUDA 11.6]
- Device: [e.g. A100-SXM-80G]
Additional context
No response