ColossalAI
[BUG]: The embedding weight is not assigned when using GeminiDDP
🐛 Describe the bug
It seems that the embedding weight is not assigned when I wrap the model with GeminiDDP. The model works when I initialize it with the from_pretrained function, but fails after it is wrapped with GeminiDDP.
Still works, with params on CPU:

with ColoInitContext(device=init_dev):
    model = AutoModelForSeq2SeqLM.from_pretrained(
        args.model_name_or_path,
        trust_remote_code=True
    )
Does not work, with params on GPU:
model = GeminiDDP(model, device=get_current_device(), placement_policy=PLACEMENT_POLICY, pin_memory=True)
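For context, a minimal sketch of the full path I run (the model path, placement policy, prompt, and init device below are placeholders standing in for my script, and import locations may differ slightly between ColossalAI versions). Calling the wrapped model then produces the traceback below:

import torch
import colossalai
from colossalai.nn.parallel import GeminiDDP
from colossalai.utils import get_current_device
from colossalai.utils.model.colo_init_context import ColoInitContext
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

colossalai.launch_from_torch(config={})

# Step 1: build the model under ColoInitContext (this works, params stay on CPU).
init_dev = torch.device("cpu")                  # placeholder for init_dev in my script
with ColoInitContext(device=init_dev):
    model = AutoModelForSeq2SeqLM.from_pretrained(
        "THUDM/chatglm-6b",                     # placeholder for args.model_name_or_path
        trust_remote_code=True,
    )

# Step 2: wrap with GeminiDDP (params move to GPU); this is where things go wrong.
model = GeminiDDP(
    model,
    device=get_current_device(),
    placement_policy="cuda",                    # placeholder for PLACEMENT_POLICY
    pin_memory=True,
)

# Step 3: run a forward pass; the embedding lookup raises the RuntimeError below.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
input_ids = tokenizer("hello", return_tensors="pt").to(get_current_device())
outputs = model(**input_ids)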
│ ❱ 582 │ outputs = model(**input_ids) │
│ 583 │ outputs = model.module.generate(**input_ids, **gen_kwargs) │
│ 584 │ outputs = outputs.tolist()[0][len(input_ids["input_ids"][0]) - 2:] │
│ 585 │ response = tokenizer.decode(outputs) │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py:276 in forward │
│ │
│ 273 │ │ self.module.zero_grad(set_to_none=True) │
│ 274 │ │ self.gemini_manager.pre_iter(*args) │
│ 275 │ │ with ColoParamOpHookManager.use_hooks(self.param_op_hook): │
│ ❱ 276 │ │ │ outputs = self.module(*args, **kwargs) │
│ 277 │ │ if self.force_outputs_fp32: │
│ 278 │ │ │ return _cast_float(outputs, torch.float) │
│ 279 │ │ return outputs │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/local/modeling_chatglm.py:1018 in forward │
│ │
│ 1015 │ │ use_cache = use_cache if use_cache is not None else self.config.use_cache │
│ 1016 │ │ return_dict = return_dict if return_dict is not None else self.config.use_return │
│ 1017 │ │ │
│ ❱ 1018 │ │ transformer_outputs = self.transformer( │
│ 1019 │ │ │ input_ids=input_ids, │
│ 1020 │ │ │ position_ids=position_ids, │
│ 1021 │ │ │ attention_mask=attention_mask, │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/local/modeling_chatglm.py:838 in forward │
│ │
│ 835 │ │ │ │ ) │
│ 836 │ │ │
│ 837 │ │ if inputs_embeds is None: │
│ ❱ 838 │ │ │ inputs_embeds = self.word_embeddings(input_ids) │
│ 839 │ │ │
│ 840 │ │ # [seq_len, batch, hidden_size] │
│ 841 │ │ hidden_states = inputs_embeds.transpose(0, 1) │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/modules/sparse.py:158 in forward │
│ │
│ 155 │ │ │ │ self.weight[self.padding_idx].fill_(0) │
│ 156 │ │
│ 157 │ def forward(self, input: Tensor) -> Tensor: │
│ ❱ 158 │ │ return F.embedding( │
│ 159 │ │ │ input, self.weight, self.padding_idx, self.max_norm, │
│ 160 │ │ │ self.norm_type, self.scale_grad_by_freq, self.sparse) │
│ 161 │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/functional.py:2168 in embedding │
│ │
│ 2165 │ """ │
│ 2166 │ │
│ 2167 │ if has_torch_function_variadic(input, weight): │
│ ❱ 2168 │ │ return handle_torch_function( │
│ 2169 │ │ │ embedding, │
│ 2170 │ │ │ (input, weight), │
│ 2171 │ │ │ input, │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/overrides.py:1498 in handle_torch_function │
│ │
│ 1495 │ │ │
│ 1496 │ │ # Use `public_api` instead of `implementation` so __torch_function__ │
│ 1497 │ │ # implementations can do equality/identity comparisons. │
│ ❱ 1498 │ │ result = torch_func_method(public_api, types, args, kwargs) │
│ 1499 │ │ │
│ 1500 │ │ if result is not NotImplemented: │
│ 1501 │ │ │ return result │
│ │
│ /opt/conda/lib/python3.9/site-packages/colossalai/tensor/colo_tensor.py:183 in │
│ __torch_function__ │
│ │
│ 180 │ │ │ │ return backward_tensor.backward(**tensor_kwargs) │
│ 181 │ │ │
│ 182 │ │ with torch._C.DisableTorchFunction(): │
│ ❱ 183 │ │ │ ret = func(*args, **kwargs) │
│ 184 │ │ │ if func in _get_my_nowrap_functions(): │
│ 185 │ │ │ │ return ret │
│ 186 │ │ │ else: │
│ │
│ /opt/conda/lib/python3.9/site-packages/colossalai/nn/ops/embedding.py:116 in colo_embedding │
│ │
│ 113 │ │
│ 114 │ if not weight.has_compute_spec(): # No Model Parallel Applied │
│ 115 │ │ assert weight.is_replicate(), 'Invalid weight spec for native embedding op' │
│ ❱ 116 │ │ return ColoTensor.from_torch_tensor(tensor=F.embedding(input_tensor, │
│ 117 │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ weight, │
│ 118 │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ padding_idx=padding_idx, │
│ 119 │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ max_norm=max_norm, │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/functional.py:2199 in embedding │
│ │
│ 2196 │ │ # torch.embedding_renorm_ │
│ 2197 │ │ # remove once script supports set_grad_enabled │
│ 2198 │ │ _no_grad_embedding_renorm_(weight, input, max_norm, norm_type) │
│ ❱ 2199 │ return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse) │
│ 2200 │
│ 2201 │
│ 2202 def embedding_bag( │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: The tensor has a non-zero number of elements, but its data is not allocated yet. Caffe2 uses a lazy allocation, so
you will need to call mutable_data() or raw_mutable_data() to actually allocate memory.
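As a quick sanity check, one can inspect the embedding weight right after wrapping; the attribute path below (model.module.transformer.word_embeddings) is only inferred from the modeling_chatglm.py frames in the traceback, so treat it as an assumption:

# Hedged diagnostic: inspect the embedding weight after GeminiDDP wrapping.
# The attribute path is inferred from the ChatGLM traceback above.
w = model.module.transformer.word_embeddings.weight
print(type(w).__name__, tuple(w.shape), w.device, w.dtype)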
Environment
No response
Could you take a look at issue #2487? Maybe it helps.
Thanks a lot! I'll test it.
I solved it by updating ColossalAI to version 0.2.7.
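For anyone else hitting this, the upgrade should be something like the following (assuming a standard pip install from PyPI):

pip install -U colossalai==0.2.7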