
RuntimeError: Tensor on device cpu is not on the expected device meta!

Open mderouineau opened this issue 2 years ago • 0 comments

Hi, I am trying to run FastChat in CPU-only mode and I get the following error. Any idea what I need to do?

```
debian@srv-azrod:~$ python3 -m fastchat.serve.cli --model-pat vicunaWeight --device cpu
Loading checkpoint shards: 100%|████████████████████████████| 3/3 [00:05<00:00, 1.89s/it]
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at vicunaWeight and are newly initialized: ['model.layers.37.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.38.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.35.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.36.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.32.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.33.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.34.self_attn.rotary_emb.inv_freq', 'model.layers.39.self_attn.rotary_emb.inv_freq']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
USER: jouons a un jeu de role
ASSISTANT:
Traceback (most recent call last):
  File "/usr/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None, "__main__", mod_spec)
  File "/usr/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/debian/FastChat/fastchat/serve/cli.py", line 133, in <module>
    main(args)
  File "/home/debian/FastChat/fastchat/serve/cli.py", line 108, in main
    chat_loop(args.model_path, args.device, args.num_gpus, args.max_gpu_memory,
  File "/home/debian/FastChat/fastchat/serve/inference.py", line 268, in chat_loop
    outputs = chatio.stream_output(output_stream, skip_echo_len)
  File "/home/debian/FastChat/fastchat/serve/cli.py", line 30, in stream_output
    for outputs in output_stream:
  File "/home/debian/.local/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 35, in generator_context
    response = gen.send(None)
  File "/home/debian/FastChat/fastchat/serve/inference.py", line 153, in generate_stream
    out = model(
  File "/home/debian/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/debian/.local/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 687, in forward
    outputs = self.model(
  File "/home/debian/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/debian/.local/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 577, in forward
    layer_outputs = decoder_layer(
  File "/home/debian/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/debian/.local/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 289, in forward
    hidden_states = self.input_layernorm(hidden_states)
  File "/home/debian/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/debian/.local/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 91, in forward
    return self.weight * hidden_states
  File "/home/debian/.local/lib/python3.9/site-packages/torch/_prims_common/wrappers.py", line 220, in _fn
    result = fn(*args, **kwargs)
  File "/home/debian/.local/lib/python3.9/site-packages/torch/_prims_common/wrappers.py", line 130, in _fn
    result = fn(**bound.arguments)
  File "/home/debian/.local/lib/python3.9/site-packages/torch/_refs/__init__.py", line 926, in _ref
    return prim(a, b)
  File "/home/debian/.local/lib/python3.9/site-packages/torch/_refs/__init__.py", line 1532, in mul
    return prims.mul(a, b)
  File "/home/debian/.local/lib/python3.9/site-packages/torch/ops.py", line 287, in __call__
    return self._op(*args, **kwargs or {})
  File "/home/debian/.local/lib/python3.9/site-packages/torch/_prims/__init__.py", line 346, in _elementwise_meta
    utils.check_same_device(*args, allow_cpu_scalar_tensors=True)
  File "/home/debian/.local/lib/python3.9/site-packages/torch/_prims_common/__init__.py", line 596, in check_same_device
    raise RuntimeError(msg)
RuntimeError: Tensor on device cpu is not on the expected device meta!
```
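From the traceback, the crash comes from PyTorch's same-device check: the `input_layernorm` weight of a decoder layer is apparently still on the `meta` device while the hidden states are on `cpu`, which usually means some checkpoint weights were never actually materialized during loading. Below is a minimal sanity-check sketch, assuming `vicunaWeight` is a standard Hugging Face checkpoint directory (the same path used in the command above): it loads the model fully on CPU with plain transformers and lists any parameters left on `meta`.

```python
# Sanity check (sketch): load the checkpoint entirely on CPU and verify
# that no parameter is left on the meta device. "vicunaWeight" is the
# checkpoint directory from the command above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vicunaWeight")
model = AutoModelForCausalLM.from_pretrained(
    "vicunaWeight",
    torch_dtype=torch.float32,   # plain fp32 for CPU inference
    low_cpu_mem_usage=False,     # force full materialization, no lazy meta-device init
)
model.eval()

# Any parameter reported here would explain the "expected device meta" error.
meta_params = [name for name, p in model.named_parameters() if p.device.type == "meta"]
print("parameters still on meta:", meta_params or "none")

# Quick CPU smoke test: one forward pass, same call shape as generate_stream.
input_ids = tokenizer("Hello", return_tensors="pt").input_ids
with torch.no_grad():
    out = model(input_ids, use_cache=True)
print(out.logits.shape)
```

If this lists parameters still on `meta`, or the forward pass raises the same error, the problem is in how the weights are loaded rather than in FastChat's generation loop; if it runs cleanly, the loading path FastChat takes for `--device cpu` would be the place to look.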

mderouineau · Apr 24 '23 10:04