ColossalAI
[BUG]: The embedding weight is not assigned when using GeminiDDP
🐛 Describe the bug
It seems that the embedding weight is not assigned when I wrap the model with GeminiDDP. The model works when I initialize it with the from_pretrained function, but fails after it is wrapped with GeminiDDP.
Still works, with params on CPU:

with ColoInitContext(device=init_dev):
    model = AutoModelForSeq2SeqLM.from_pretrained(
        args.model_name_or_path,
        trust_remote_code=True
    )
Does not work, with params on GPU:
model = GeminiDDP(model, device=get_current_device(), placement_policy=PLACEMENT_POLICY, pin_memory=True)
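For context, a minimal sketch of the full path I run (the model path, placement policy, prompt, and init device below are placeholders standing in for my script, and import locations may differ slightly between ColossalAI versions). Calling the wrapped model then produces the traceback below:

import torch
import colossalai
from colossalai.nn.parallel import GeminiDDP
from colossalai.utils import get_current_device
from colossalai.utils.model.colo_init_context import ColoInitContext
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

colossalai.launch_from_torch(config={})

# Step 1: build the model under ColoInitContext (this works, params stay on CPU).
init_dev = torch.device("cpu")                  # placeholder for init_dev in my script
with ColoInitContext(device=init_dev):
    model = AutoModelForSeq2SeqLM.from_pretrained(
        "THUDM/chatglm-6b",                     # placeholder for args.model_name_or_path
        trust_remote_code=True,
    )

# Step 2: wrap with GeminiDDP (params move to GPU); this is where things go wrong.
model = GeminiDDP(
    model,
    device=get_current_device(),
    placement_policy="cuda",                    # placeholder for PLACEMENT_POLICY
    pin_memory=True,
)

# Step 3: run a forward pass; the embedding lookup raises the RuntimeError below.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
input_ids = tokenizer("hello", return_tensors="pt").to(get_current_device())
outputs = model(**input_ids)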
│ ❱ 582 │ outputs = model(**input_ids) │
│ 583 │ outputs = model.module.generate(**input_ids, **gen_kwargs) │
│ 584 │ outputs = outputs.tolist()[0][len(input_ids["input_ids"][0]) - 2:] │
│ 585 │ response = tokenizer.decode(outputs) │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /opt/conda/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py:276 in forward │
│ │
│ 273 │ │ self.module.zero_grad(set_to_none=True) │
│ 274 │ │ self.gemini_manager.pre_iter(*args) │
│ 275 │ │ with ColoParamOpHookManager.use_hooks(self.param_op_hook): │
│ ❱ 276 │ │ │ outputs = self.module(*args, **kwargs) │
│ 277 │ │ if self.force_outputs_fp32: │
│ 278 │ │ │ return _cast_float(outputs, torch.float) │
│ 279 │ │ return outputs │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/local/modeling_chatglm.py:1018 in forward │
│ │
│ 1015 │ │ use_cache = use_cache if use_cache is not None else self.config.use_cache │
│ 1016 │ │ return_dict = return_dict if return_dict is not None else self.config.use_return │
│ 1017 │ │ │
│ ❱ 1018 │ │ transformer_outputs = self.transformer( │
│ 1019 │ │ │ input_ids=input_ids, │
│ 1020 │ │ │ position_ids=position_ids, │
│ 1021 │ │ │ attention_mask=attention_mask, │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/local/modeling_chatglm.py:838 in forward │
│ │
│ 835 │ │ │ │ ) │
│ 836 │ │ │
│ 837 │ │ if inputs_embeds is None: │
│ ❱ 838 │ │ │ inputs_embeds = self.word_embeddings(input_ids) │
│ 839 │ │ │
│ 840 │ │ # [seq_len, batch, hidden_size] │
│ 841 │ │ hidden_states = inputs_embeds.transpose(0, 1) │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/modules/sparse.py:158 in forward │
│ │
│ 155 │ │ │ │ self.weight[self.padding_idx].fill_(0) │
│ 156 │ │
│ 157 │ def forward(self, input: Tensor) -> Tensor: │
│ ❱ 158 │ │ return F.embedding( │
│ 159 │ │ │ input, self.weight, self.padding_idx, self.max_norm, │
│ 160 │ │ │ self.norm_type, self.scale_grad_by_freq, self.sparse) │
│ 161 │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/functional.py:2168 in embedding │
│ │
│ 2165 │ """ │
│ 2166 │ │
│ 2167 │ if has_torch_function_variadic(input, weight): │
│ ❱ 2168 │ │ return handle_torch_function( │
│ 2169 │ │ │ embedding, │
│ 2170 │ │ │ (input, weight), │
│ 2171 │ │ │ input, │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/overrides.py:1498 in handle_torch_function │
│ │
│ 1495 │ │ │
│ 1496 │ │ # Use `public_api` instead of `implementation` so __torch_function__ │
│ 1497 │ │ # implementations can do equality/identity comparisons. │
│ ❱ 1498 │ │ result = torch_func_method(public_api, types, args, kwargs) │
│ 1499 │ │ │
│ 1500 │ │ if result is not NotImplemented: │
│ 1501 │ │ │ return result │
│ │
│ /opt/conda/lib/python3.9/site-packages/colossalai/tensor/colo_tensor.py:183 in │
│ __torch_function__ │
│ │
│ 180 │ │ │ │ return backward_tensor.backward(**tensor_kwargs) │
│ 181 │ │ │
│ 182 │ │ with torch._C.DisableTorchFunction(): │
│ ❱ 183 │ │ │ ret = func(*args, **kwargs) │
│ 184 │ │ │ if func in _get_my_nowrap_functions(): │
│ 185 │ │ │ │ return ret │
│ 186 │ │ │ else: │
│ │
│ /opt/conda/lib/python3.9/site-packages/colossalai/nn/ops/embedding.py:116 in colo_embedding │
│ │
│ 113 │ │
│ 114 │ if not weight.has_compute_spec(): # No Model Parallel Applied │
│ 115 │ │ assert weight.is_replicate(), 'Invalid weight spec for native embedding op' │
│ ❱ 116 │ │ return ColoTensor.from_torch_tensor(tensor=F.embedding(input_tensor, │
│ 117 │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ weight, │
│ 118 │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ padding_idx=padding_idx, │
│ 119 │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ max_norm=max_norm, │
│ │
│ /opt/conda/lib/python3.9/site-packages/torch/nn/functional.py:2199 in embedding │
│ │
│ 2196 │ │ # torch.embedding_renorm_ │
│ 2197 │ │ # remove once script supports set_grad_enabled │
│ 2198 │ │ _no_grad_embedding_renorm_(weight, input, max_norm, norm_type) │
│ ❱ 2199 │ return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse) │
│ 2200 │
│ 2201 │
│ 2202 def embedding_bag( │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: The tensor has a non-zero number of elements, but its data is not allocated yet. Caffe2 uses a lazy allocation, so
you will need to call mutable_data() or raw_mutable_data() to actually allocate memory.
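As a quick sanity check, one can inspect the embedding weight right after wrapping; the attribute path below (model.module.transformer.word_embeddings) is only inferred from the modeling_chatglm.py frames in the traceback, so treat it as an assumption:

# Hedged diagnostic: inspect the embedding weight after GeminiDDP wrapping.
# The attribute path is inferred from the ChatGLM traceback above.
w = model.module.transformer.word_embeddings.weight
print(type(w).__name__, tuple(w.shape), w.device, w.dtype)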
Environment
No response
Could you take a look at issue #2487? Maybe it helps.
Thanks a lot! I'll test it.
I solved it by updating ColossalAI to version 0.2.7.
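For anyone else hitting this, the upgrade should be something like the following (assuming a standard pip install from PyPI):

pip install -U colossalai==0.2.7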