Welcome to the MOSS AI assistant! Type a message to start chatting. Enter clear to clear the conversation history, or stop to end the session.
<|Human|>: 你
After entering the query (here just 你, "you"), the following error is raised:
/usr/bin/ld: cannot find -lcuda: No such file or directory
collect2: error: ld returned 1 exit status
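
For context: this linker failure comes from Triton, which compiles a small C launcher stub for each kernel and links it with -lcuda, so the build dies when gcc cannot find libcuda.so on its library search path. A minimal diagnostic sketch in Python, assuming a typical Linux CUDA layout (the two paths below are common locations, not guarantees):

    # Probe whether libcuda.so is discoverable; both paths are assumptions
    # for a typical Linux install and may differ on your machine.
    import ctypes.util
    import glob

    print(ctypes.util.find_library("cuda"))  # None => the linker cannot see libcuda
    for pattern in ("/usr/lib/x86_64-linux-gnu/libcuda.so*",     # driver-installed library
                    "/usr/local/cuda/lib64/stubs/libcuda.so*"):  # stub shipped with the toolkit
        print(pattern, "->", glob.glob(pattern))
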
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in matmul_248_kernel:21 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
KeyError:
('2-.-0-.-0--d6252949da17ceb5f3a278a70250af13-3b85c7bef5f0a641282f3b73af50f599-3d2aedeb40d6d81c66a42791e268f98b-3498c340fd4b6ee7805fd54b882a04f5-e1f133f98d04093da2078dfc51c36b72-b26258bf01f839199e39d64851
821f26-d7c06e3b46e708006c15224aac7a1378-f585402118c8a136948ce0a49cfe122c', (torch.float16, torch.int32, torch.float16, torch.float16, torch.int32, torch.int32, 'i32', 'i32', 'i32', 'i32', 'i32', 'i32',
'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32'), (256, 64, 32, 8), (True, True, True, True, True, True, (False, False), (True, False), (True, False), (False, False), (False, False), (True, False),
(False, True), (True, False), (False, True), (True, False), (False, True), (True, False), (True, False)))
During handling of the above exception, another exception occurred:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /data/MOSS/moss_cli_demo.py:100 in │
│ │
│ 97 │ │ │ print(response.lstrip('\n')) │
│ 98 │
│ 99 if __name__ == "__main__": │
│ ❱ 100 │ main() │
│ 101 │
│ │
│ /data/MOSS/moss_cli_demo.py:82 in main │
│ │
│ 79 │ │ prompt += '<|Human|>: ' + query + '<eoh>' │
│ 80 │ │ inputs = tokenizer(prompt, return_tensors="pt") │
│ 81 │ │ with torch.no_grad(): │
│ ❱ 82 │ │ │ outputs = model.generate( │
│ 83 │ │ │ │ inputs.input_ids.cuda(), │
│ 84 │ │ │ │ attention_mask=inputs.attention_mask.cuda(), │
│ 85 │ │ │ │ max_length=2048, │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27 in decorate_context │
│ │
│ 24 │ │ @functools.wraps(func) │
│ 25 │ │ def decorate_context(*args, **kwargs): │
│ 26 │ │ │ with self.clone(): │
│ ❱ 27 │ │ │ │ return func(*args, **kwargs) │
│ 28 │ │ return cast(F, decorate_context) │
│ 29 │ │
│ 30 │ def _wrap_generator(self, func): │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py:1571 in generate │
│ │
│ 1568 │ │ │ ) │
│ 1569 │ │ │ │
│ 1570 │ │ │ # 12. run sample │
│ ❱ 1571 │ │ │ return self.sample( │
│ 1572 │ │ │ │ input_ids, │
│ 1573 │ │ │ │ logits_processor=logits_processor, │
│ 1574 │ │ │ │ logits_warper=logits_warper, │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py:2534 in sample │
│ │
│ 2531 │ │ │ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) │
│ 2532 │ │ │ │
│ 2533 │ │ │ # forward pass to get next token │
│ ❱ 2534 │ │ │ outputs = self( │
│ 2535 │ │ │ │ **model_inputs, │
│ 2536 │ │ │ │ return_dict=True, │
│ 2537 │ │ │ │ output_attentions=output_attentions, │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl │
│ │
│ 1191 │ │ # this function, and just call forward. │
│ 1192 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1193 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1194 │ │ │ return forward_call(*input, **kwargs) │
│ 1195 │ │ # Do not call functions when jit is used │
│ 1196 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1197 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /data/MOSS/models/modeling_moss.py:678 in forward │
│ │
│ 675 │ │ """ │
│ 676 │ │ return_dict = return_dict if return_dict is not None else self.config.use_return │
│ 677 │ │ │
│ ❱ 678 │ │ transformer_outputs = self.transformer( │
│ 679 │ │ │ input_ids, │
│ 680 │ │ │ past_key_values=past_key_values, │
│ 681 │ │ │ attention_mask=attention_mask, │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl │
│ │
│ 1191 │ │ # this function, and just call forward. │
│ 1192 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1193 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1194 │ │ │ return forward_call(*input, **kwargs) │
│ 1195 │ │ # Do not call functions when jit is used │
│ 1196 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1197 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /data/MOSS/models/modeling_moss.py:545 in forward │
│ │
│ 542 │ │ │ │ │ head_mask[i], │
│ 543 │ │ │ │ ) │
│ 544 │ │ │ else: │
│ ❱ 545 │ │ │ │ outputs = block( │
│ 546 │ │ │ │ │ hidden_states=hidden_states, │
│ 547 │ │ │ │ │ layer_past=layer_past, │
│ 548 │ │ │ │ │ attention_mask=attention_mask, │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl │
│ │
│ 1191 │ │ # this function, and just call forward. │
│ 1192 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1193 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1194 │ │ │ return forward_call(*input, **kwargs) │
│ 1195 │ │ # Do not call functions when jit is used │
│ 1196 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1197 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /data/MOSS/models/modeling_moss.py:270 in forward │
│ │
│ 267 │ ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor │
│ 268 │ │ residual = hidden_states │
│ 269 │ │ hidden_states = self.ln_1(hidden_states) │
│ ❱ 270 │ │ attn_outputs = self.attn( │
│ 271 │ │ │ hidden_states=hidden_states, │
│ 272 │ │ │ layer_past=layer_past, │
│ 273 │ │ │ attention_mask=attention_mask, │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl │
│ │
│ 1191 │ │ # this function, and just call forward. │
│ 1192 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1193 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1194 │ │ │ return forward_call(*input, **kwargs) │
│ 1195 │ │ # Do not call functions when jit is used │
│ 1196 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1197 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /data/MOSS/models/modeling_moss.py:164 in forward │
│ │
│ 161 │ │ Tuple[torch.Tensor, Tuple[torch.Tensor]], │
│ 162 │ │ Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], │
│ 163 │ ]: │
│ ❱ 164 │ │ qkv = self.qkv_proj(hidden_states) │
│ 165 │ │ # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass │
│ 166 │ │ mp_num = 4 │
│ 167 │ │ qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1)) │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl │
│ │
│ 1191 │ │ # this function, and just call forward. │
│ 1192 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1193 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1194 │ │ │ return forward_call(*input, **kwargs) │
│ 1195 │ │ # Do not call functions when jit is used │
│ 1196 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1197 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /data/MOSS/models/quantization.py:367 in forward │
│ │
│ 364 │ │
│ 365 │ def forward(self, x): │
│ 366 │ │ out_shape = x.shape[:-1] + (self.outfeatures,) │
│ ❱ 367 │ │ out = QuantLinearFunction.apply(x.reshape(-1, x.shape[-1]), self.qweight, self.s │
│ 368 │ │ │ │ │ │ │ │ │ │ self.qzeros, self.g_idx, self.bits, self.maxq) │
│ 369 │ │ out = out + self.bias if self.bias is not None else out │
│ 370 │ │ return out.reshape(out_shape) │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py:105 in decorate_fwd │
│ │
│ 102 │ │ │ │ with autocast(enabled=False): │
│ 103 │ │ │ │ │ return fwd(*_cast(args, cast_inputs), **_cast(kwargs, cast_inputs)) │
│ 104 │ │ │ else: │
│ ❱ 105 │ │ │ │ return fwd(*args, **kwargs) │
│ 106 │ return decorate_fwd │
│ 107 │
│ 108 │
│ │
│ /data/MOSS/models/quantization.py:279 in forward │
│ │
│ 276 │ @staticmethod │
│ 277 │ @custom_fwd(cast_inputs=torch.float16) │
│ 278 │ def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq): │
│ ❱ 279 │ │ output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq) │
│ 280 │ │ ctx.save_for_backward(qweight, scales, qzeros, g_idx) │
│ 281 │ │ ctx.bits, ctx.maxq = bits, maxq │
│ 282 │ │ return output │
│ │
│ /data/MOSS/models/quantization.py:250 in matmul248 │
│ │
│ 247 │ output = torch.empty((input.shape[0], qweight.shape[1]), device='cuda', dtype=torch. │
│ 248 │ grid = lambda META: ( │
│ 249 │ triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(qweight.shape[1], ME │
│ ❱ 250 │ matmul_248_kernel[grid](input, qweight, output, │
│ 251 │ │ │ │ │ │ │ scales, qzeros, g_idx, │
│ 252 │ │ │ │ │ │ │ input.shape[0], qweight.shape[1], input.shape[1], bits, maxq │
│ 253 │ │ │ │ │ │ │ input.stride(0), input.stride(1), │
│ │
│ /data/MOSS/models/custom_autotune.py:89 in run │
│ │
│ 86 │ │ │ │ # prune configs │
│ 87 │ │ │ │ pruned_configs = self.prune_configs(kwargs) │
│ 88 │ │ │ │ bench_start = time.time() │
│ ❱ 89 │ │ │ │ timings = {config: self._bench(*args, config=config, **kwargs) │
│ 90 │ │ │ │ │ │ │ for config in pruned_configs} │
│ 91 │ │ │ │ bench_end = time.time() │
│ 92 │ │ │ │ self.bench_time = bench_end - bench_start │
│ │
│ /data/MOSS/models/custom_autotune.py:89 in │
│ │
│ 86 │ │ │ │ # prune configs │
│ 87 │ │ │ │ pruned_configs = self.prune_configs(kwargs) │
│ 88 │ │ │ │ bench_start = time.time() │
│ ❱ 89 │ │ │ │ timings = {config: self._bench(*args, config=config, **kwargs) │
│ 90 │ │ │ │ │ │ │ for config in pruned_configs} │
│ 91 │ │ │ │ bench_end = time.time() │
│ 92 │ │ │ │ self.bench_time = bench_end - bench_start │
│ │
│ /data/MOSS/models/custom_autotune.py:71 in _bench │
│ │
│ 68 │ │ try: │
│ 69 │ │ │ # In testings using only 40 reps seems to be close enough and it appears to │
│ 70 │ │ │ # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll │
│ ❱ 71 │ │ │ return triton.testing.do_bench(kernel_call, rep=40) │
│ 72 │ │ except triton.compiler.OutOfResources: │
│ 73 │ │ │ return float('inf') │
│ 74 │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/triton/testing.py:143 in do_bench │
│ │
│ 140 │ """ │
│ 141 │ │
│ 142 │ # Estimate the runtime of the function │
│ ❱ 143 │ fn() │
│ 144 │ torch.cuda.synchronize() │
│ 145 │ start_event = torch.cuda.Event(enable_timing=True) │
│ 146 │ end_event = torch.cuda.Event(enable_timing=True) │
│ │
│ /data/MOSS/models/custom_autotune.py:67 in kernel_call │
│ │
│ 64 │ │ │ if config.pre_hook: │
│ 65 │ │ │ │ config.pre_hook(self.nargs) │
│ 66 │ │ │ self.hook(args) │
│ ❱ 67 │ │ │ self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, │
│ 68 │ │ try: │
│ 69 │ │ │ # In testings using only 40 reps seems to be close enough and it appears to │
│ 70 │ │ │ # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll │
│ in matmul_248_kernel:41 │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/triton/compiler.py:1588 in compile │
│ │
│ 1585 │ │ first_stage = list(stages.keys()).index(ir) │
│ 1586 │ │
│ 1587 │ # cache manager │
│ ❱ 1588 │ so_path = make_stub(name, signature, constants) │
│ 1589 │ # create cache manager │
│ 1590 │ fn_cache_manager = CacheManager(make_hash(fn, **kwargs)) │
│ 1591 │ # determine name and extension type of provided function │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/triton/compiler.py:1477 in make_stub │
│ │
│ 1474 │ │ │ src_path = os.path.join(tmpdir, "main.c") │
│ 1475 │ │ │ with open(src_path, "w") as f: │
│ 1476 │ │ │ │ f.write(src) │
│ ❱ 1477 │ │ │ so = _build(name, src_path, tmpdir) │
│ 1478 │ │ │ with open(so, "rb") as f: │
│ 1479 │ │ │ │ so_cache_manager.put(f.read(), so_name, binary=True) │
│ 1480 │ return so_cache_manager._make_path(so_name) │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/triton/compiler.py:1392 in _build │
│ │
│ 1389 │ │
│ 1390 │ cc_cmd = [cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir} │
│ 1391 │ cc_cmd += [f"-L{dir}" for dir in cuda_lib_dirs] │
│ ❱ 1392 │ ret = subprocess.check_call(cc_cmd) │
│ 1393 │ │
│ 1394 │ if ret == 0: │
│ 1395 │ │ return so │
│ │
│ /opt/miniconda3/lib/python3.10/subprocess.py:369 in check_call │
│ │
│ 366 │ │ cmd = kwargs.get("args") │
│ 367 │ │ if cmd is None: │
│ 368 │ │ │ cmd = popenargs[0] │
│ ❱ 369 │ │ raise CalledProcessError(retcode, cmd) │
│ 370 │ return 0 │
│ 371 │
│ 372 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmpneqhk1aw/main.c', '-O3', '-I/usr/local/cuda/include', '-I/opt/miniconda3/include/python3.10', '-I/tmp/tmpneqhk1aw', '-shared', '-fPIC', '-lcuda',
'-o', '/tmp/tmpneqhk1aw/matmul_248_kernel.cpython-310-x86_64-linux-gnu.so']' returned non-zero exit status 1.
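
How to read the chained traceback: the KeyError at the top is a secondary symptom. Triton first looks the kernel up in its compile cache, misses, and compiles it inside the except handler; when the gcc link step fails, Python reports the CalledProcessError as arising during handling of the KeyError. A hedged illustration of that pattern (a simplification for exposition, not Triton's actual source):

    import subprocess

    cache = {}

    def compile_kernel(key):
        # stand-in for Triton's make_stub/_build step, which shells out to gcc
        raise subprocess.CalledProcessError(1, ["/usr/bin/gcc", "-lcuda"])

    def launch(key):
        try:
            return cache[key]                 # first call misses -> KeyError
        except KeyError:
            cache[key] = compile_kernel(key)  # the recovery itself fails, chaining
            return cache[key]                 # CalledProcessError onto the KeyError

Calling launch(...) with any key reproduces the two-stage traceback shape shown above.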
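
As for the root cause: the gcc command above passes -lcuda but no -L directory that actually contains libcuda.so. A workaround sketch, assuming the CUDA toolkit's stub library lives at /usr/local/cuda/lib64/stubs (verify the path on your system; linking against the stub still requires the real NVIDIA driver at run time):

    # Hypothetical launcher: put the stub directory on gcc's link-time search
    # path (LIBRARY_PATH) before starting the demo. A symlink to the driver's
    # real libcuda.so.1 is an alternative that avoids the stub entirely.
    import os
    import subprocess

    stubs = "/usr/local/cuda/lib64/stubs"  # assumption: standard toolkit layout
    env = dict(os.environ)
    env["LIBRARY_PATH"] = stubs + ":" + env.get("LIBRARY_PATH", "")
    subprocess.run(["python", "moss_cli_demo.py"], env=env, check=True)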