
[BUG]

Open · butyuhao opened this issue 1 year ago · 0 comments

🐛 Describe the bug

I wrote the following code, and I think the dimensions of the tensors are correct. What should I do?

    class LlamaMLP(nn.Module):
        def __init__(
            self,
            hidden_size: int,
            intermediate_size: int,
            hidden_act: str,
        ):
            super().__init__()
            # self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
            self.gate_proj = col_nn.Linear(hidden_size, intermediate_size, dtype=torch.float, bias=False)
            # self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
            self.down_proj = col_nn.Linear(intermediate_size, hidden_size, dtype=torch.float, bias=False)
            # self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
            self.up_proj = col_nn.Linear(hidden_size, intermediate_size, dtype=torch.float, bias=False)
            self.act_fn = ACT2FN[hidden_act]

        def forward(self, x):
            up = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
            down = self.down_proj(up)
            return down
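
To check my understanding of the shapes, here is a plain-PyTorch mock of how I expect 1D tensor parallelism to split this MLP with TENSOR_PARALLEL_SIZE = 2. This is my own sketch with made-up weights, not the actual col_nn.Linear implementation, and intermediate_size = 11008 is an assumed LLaMA-7B value:

    import torch
    import torch.nn.functional as F

    tp = 2                                       # TENSOR_PARALLEL_SIZE
    hidden, inter = 4096, 11008                  # assumed LLaMA-7B sizes
    x = torch.randn(1, 128, hidden)              # full hidden_states on every rank

    # Column-parallel gate/up: weight split along the output dim -> [inter/tp, hidden]
    w_gate = torch.randn(inter // tp, hidden)
    w_up = torch.randn(inter // tp, hidden)
    up = F.silu(x @ w_gate.T) * (x @ w_up.T)     # [1, 128, inter/tp], activation stays split

    # Row-parallel down: weight split along the input dim -> [hidden, inter/tp].
    # Its input must already be split; ranks then all-reduce the partial sums.
    w_down = torch.randn(hidden, inter // tp)
    down_partial = up @ w_down.T                 # [1, 128, hidden], partial result per rank
    print(up.shape, down_partial.shape)

Under that reading, gate_proj and up_proj should end up column-split and down_proj row-split, so the last dim entering the row-parallel layer is inter/tp, not the full size.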

the output I get is (both ranks print the same traceback; one copy below):

    Traceback (most recent call last):
      File "train.py", line 158, in <module>
        main()
      File "train.py", line 151, in main
        engine.execute_schedule(data_iter, return_output_label=False)
      File "/opt/conda/lib/python3.8/site-packages/colossalai/engine/_base_engine.py", line 201, in execute_schedule
        output, label, loss = self._schedule.forward_backward_step(self, data_iter, **kwargs)
      File "/opt/conda/lib/python3.8/site-packages/colossalai/engine/schedule/_non_pipeline_schedule.py", line 78, in forward_backward_step
        output = self._call_engine(engine, data)
      File "/opt/conda/lib/python3.8/site-packages/colossalai/engine/schedule/_base_schedule.py", line 109, in _call_engine
        return engine(inputs)
      File "/opt/conda/lib/python3.8/site-packages/colossalai/engine/_base_engine.py", line 186, in __call__
        return self.model(*args, **kwargs)
      File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1185, in _call_impl
        return forward_call(*input, **kwargs)
      File "/nas-alinlp/butyuhao/GLM/colossal-ai/llama/modeling_llama.py", line 661, in forward
        layer_outputs = decoder_layer(
      File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1185, in _call_impl
        return forward_call(*input, **kwargs)
      File "/nas-alinlp/butyuhao/GLM/colossal-ai/llama/modeling_llama.py", line 348, in forward
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
      File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1185, in _call_impl
        return forward_call(*input, **kwargs)
      File "/nas-alinlp/butyuhao/GLM/colossal-ai/llama/modeling_llama.py", line 218, in forward
        qkv = self.query_key_value(hidden_states)
      File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1185, in _call_impl
        return forward_call(*input, **kwargs)
      File "/opt/conda/lib/python3.8/site-packages/colossalai/nn/layer/colossalai_layer/_utils.py", line 41, in forward
        return self.module(*args)
      File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1185, in _call_impl
        return forward_call(*input, **kwargs)
      File "/opt/conda/lib/python3.8/site-packages/colossalai/nn/layer/colossalai_layer/_utils.py", line 41, in forward
        return self.module(*args)
      File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1185, in _call_impl
        return forward_call(*input, **kwargs)
      File "/opt/conda/lib/python3.8/site-packages/colossalai/nn/layer/parallel_1d/layers.py", line 697, in forward
        assert input.shape[-1] == self.weight.shape[-1], \
    AssertionError: Invalid shapes in Linear1D_Row forward: input=torch.Size([1, 128, 4096]), weight=torch.Size([12288, 2048]). Expected last dim of input 2048.
    ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 60198) of binary: /opt/conda/bin/python
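
If I decode the numbers in the assertion (my interpretation only, not checked against the Colossal-AI source), the weight shape [12288, 2048] matches [3 * hidden_size, hidden_size / tp], i.e. query_key_value was built as a row-parallel Linear1D_Row, yet it receives the full, unsplit 4096-dim hidden_states:

    hidden, tp = 4096, 2
    assert (3 * hidden, hidden // tp) == (12288, 2048)  # the weight shape in the error
    # Expected last dim of input: hidden // tp == 2048
    # Actual last dim of input:   hidden == 4096 (unsplit hidden_states)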

Environment

    TENSOR_PARALLEL_SIZE = 2
    TENSOR_PARALLEL_MODE = '1d'

    parallel = dict(
        pipeline=1,
        tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
    )

    batch_size = 1
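
For reference, this config is consumed roughly as follows with the legacy Colossal-AI engine API. This is a paraphrase, not my exact train.py; model, optimizer, criterion, and dataloader setup are omitted:

    import colossalai

    # Reads the parallel/batch_size dict above from the config file.
    colossalai.launch_from_torch(config='./config.py')

    # model, optimizer, criterion, train_dataloader are assumed to be built already.
    engine, train_dataloader, _, _ = colossalai.initialize(
        model, optimizer, criterion, train_dataloader)

    engine.train()
    # Same call that appears in the traceback above.
    engine.execute_schedule(iter(train_dataloader), return_output_label=False)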

butyuhao · Apr 06 '23 09:04