Welcome to the MOSS AI assistant! Type a message to start chatting. Enter clear to clear the conversation history, or stop to end the session.
<|Human|>: 你
After entering the query (here just 你, "you"), the following error is raised:
/usr/bin/ld: cannot find -lcuda: No such file or directory
collect2: error: ld returned 1 exit status
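
For context: this linker failure comes from Triton, which compiles a small C launcher stub for each kernel and links it with -lcuda, so the build dies when gcc cannot find libcuda.so on its library search path. A minimal diagnostic sketch in Python, assuming a typical Linux CUDA layout (the two paths below are common locations, not guarantees):

    # Probe whether libcuda.so is discoverable; both paths are assumptions
    # for a typical Linux install and may differ on your machine.
    import ctypes.util
    import glob

    print(ctypes.util.find_library("cuda"))  # None => the linker cannot see libcuda
    for pattern in ("/usr/lib/x86_64-linux-gnu/libcuda.so*",     # driver-installed library
                    "/usr/local/cuda/lib64/stubs/libcuda.so*"):  # stub shipped with the toolkit
        print(pattern, "->", glob.glob(pattern))
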
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in matmul_248_kernel:21 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
KeyError:
('2-.-0-.-0--d6252949da17ceb5f3a278a70250af13-3b85c7bef5f0a641282f3b73af50f599-3d2aedeb40d6d81c66a42791e268f98b-3498c340fd4b6ee7805fd54b882a04f5-e1f133f98d04093da2078dfc51c36b72-b26258bf01f839199e39d64851
821f26-d7c06e3b46e708006c15224aac7a1378-f585402118c8a136948ce0a49cfe122c', (torch.float16, torch.int32, torch.float16, torch.float16, torch.int32, torch.int32, 'i32', 'i32', 'i32', 'i32', 'i32', 'i32',
'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32'), (256, 64, 32, 8), (True, True, True, True, True, True, (False, False), (True, False), (True, False), (False, False), (False, False), (True, False),
(False, True), (True, False), (False, True), (True, False), (False, True), (True, False), (True, False)))
During handling of the above exception, another exception occurred:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /data/MOSS/moss_cli_demo.py:100 in │
│ │
│ 97 │ │ │ print(response.lstrip('\n')) │
│ 98 │
│ 99 if __name__ == "__main__": │
│ ❱ 100 │ main() │
│ 101 │
│ │
│ /data/MOSS/moss_cli_demo.py:82 in main │
│ │
│ 79 │ │ prompt += '<|Human|>: ' + query + '<eoh>' │
│ 80 │ │ inputs = tokenizer(prompt, return_tensors="pt") │
│ 81 │ │ with torch.no_grad(): │
│ ❱ 82 │ │ │ outputs = model.generate( │
│ 83 │ │ │ │ inputs.input_ids.cuda(), │
│ 84 │ │ │ │ attention_mask=inputs.attention_mask.cuda(), │
│ 85 │ │ │ │ max_length=2048, │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27 in decorate_context │
│ │
│ 24 │ │ @functools.wraps(func) │
│ 25 │ │ def decorate_context(*args, **kwargs): │
│ 26 │ │ │ with self.clone(): │
│ ❱ 27 │ │ │ │ return func(*args, **kwargs) │
│ 28 │ │ return cast(F, decorate_context) │
│ 29 │ │
│ 30 │ def _wrap_generator(self, func): │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py:1571 in generate │
│ │
│ 1568 │ │ │ ) │
│ 1569 │ │ │ │
│ 1570 │ │ │ # 12. run sample │
│ ❱ 1571 │ │ │ return self.sample( │
│ 1572 │ │ │ │ input_ids, │
│ 1573 │ │ │ │ logits_processor=logits_processor, │
│ 1574 │ │ │ │ logits_warper=logits_warper, │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py:2534 in sample │
│ │
│ 2531 │ │ │ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) │
│ 2532 │ │ │ │
│ 2533 │ │ │ # forward pass to get next token │
│ ❱ 2534 │ │ │ outputs = self( │
│ 2535 │ │ │ │ **model_inputs, │
│ 2536 │ │ │ │ return_dict=True, │
│ 2537 │ │ │ │ output_attentions=output_attentions, │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl │
│ │
│ 1191 │ │ # this function, and just call forward. │
│ 1192 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1193 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1194 │ │ │ return forward_call(*input, **kwargs) │
│ 1195 │ │ # Do not call functions when jit is used │
│ 1196 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1197 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /data/MOSS/models/modeling_moss.py:678 in forward │
│ │
│ 675 │ │ """ │
│ 676 │ │ return_dict = return_dict if return_dict is not None else self.config.use_return │
│ 677 │ │ │
│ ❱ 678 │ │ transformer_outputs = self.transformer( │
│ 679 │ │ │ input_ids, │
│ 680 │ │ │ past_key_values=past_key_values, │
│ 681 │ │ │ attention_mask=attention_mask, │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl │
│ │
│ 1191 │ │ # this function, and just call forward. │
│ 1192 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1193 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1194 │ │ │ return forward_call(*input, **kwargs) │
│ 1195 │ │ # Do not call functions when jit is used │
│ 1196 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1197 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /data/MOSS/models/modeling_moss.py:545 in forward │
│ │
│ 542 │ │ │ │ │ head_mask[i], │
│ 543 │ │ │ │ ) │
│ 544 │ │ │ else: │
│ ❱ 545 │ │ │ │ outputs = block( │
│ 546 │ │ │ │ │ hidden_states=hidden_states, │
│ 547 │ │ │ │ │ layer_past=layer_past, │
│ 548 │ │ │ │ │ attention_mask=attention_mask, │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl │
│ │
│ 1191 │ │ # this function, and just call forward. │
│ 1192 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1193 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1194 │ │ │ return forward_call(*input, **kwargs) │
│ 1195 │ │ # Do not call functions when jit is used │
│ 1196 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1197 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /data/MOSS/models/modeling_moss.py:270 in forward │
│ │
│ 267 │ ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor │
│ 268 │ │ residual = hidden_states │
│ 269 │ │ hidden_states = self.ln_1(hidden_states) │
│ ❱ 270 │ │ attn_outputs = self.attn( │
│ 271 │ │ │ hidden_states=hidden_states, │
│ 272 │ │ │ layer_past=layer_past, │
│ 273 │ │ │ attention_mask=attention_mask, │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl │
│ │
│ 1191 │ │ # this function, and just call forward. │
│ 1192 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1193 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1194 │ │ │ return forward_call(*input, **kwargs) │
│ 1195 │ │ # Do not call functions when jit is used │
│ 1196 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1197 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /data/MOSS/models/modeling_moss.py:164 in forward │
│ │
│ 161 │ │ Tuple[torch.Tensor, Tuple[torch.Tensor]], │
│ 162 │ │ Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], │
│ 163 │ ]: │
│ ❱ 164 │ │ qkv = self.qkv_proj(hidden_states) │
│ 165 │ │ # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass │
│ 166 │ │ mp_num = 4 │
│ 167 │ │ qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1)) │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl │
│ │
│ 1191 │ │ # this function, and just call forward. │
│ 1192 │ │ if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o │
│ 1193 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1194 │ │ │ return forward_call(*input, **kwargs) │
│ 1195 │ │ # Do not call functions when jit is used │
│ 1196 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1197 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /data/MOSS/models/quantization.py:367 in forward │
│ │
│ 364 │ │
│ 365 │ def forward(self, x): │
│ 366 │ │ out_shape = x.shape[:-1] + (self.outfeatures,) │
│ ❱ 367 │ │ out = QuantLinearFunction.apply(x.reshape(-1, x.shape[-1]), self.qweight, self.s │
│ 368 │ │ │ │ │ │ │ │ │ │ self.qzeros, self.g_idx, self.bits, self.maxq) │
│ 369 │ │ out = out + self.bias if self.bias is not None else out │
│ 370 │ │ return out.reshape(out_shape) │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py:105 in decorate_fwd │
│ │
│ 102 │ │ │ │ with autocast(enabled=False): │
│ 103 │ │ │ │ │ return fwd(*_cast(args, cast_inputs), **_cast(kwargs, cast_inputs)) │
│ 104 │ │ │ else: │
│ ❱ 105 │ │ │ │ return fwd(*args, **kwargs) │
│ 106 │ return decorate_fwd │
│ 107 │
│ 108 │
│ │
│ /data/MOSS/models/quantization.py:279 in forward │
│ │
│ 276 │ @staticmethod │
│ 277 │ @custom_fwd(cast_inputs=torch.float16) │
│ 278 │ def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq): │
│ ❱ 279 │ │ output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq) │
│ 280 │ │ ctx.save_for_backward(qweight, scales, qzeros, g_idx) │
│ 281 │ │ ctx.bits, ctx.maxq = bits, maxq │
│ 282 │ │ return output │
│ │
│ /data/MOSS/models/quantization.py:250 in matmul248 │
│ │
│ 247 │ output = torch.empty((input.shape[0], qweight.shape[1]), device='cuda', dtype=torch. │
│ 248 │ grid = lambda META: ( │
│ 249 │ triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(qweight.shape[1], ME │
│ ❱ 250 │ matmul_248_kernel[grid](input, qweight, output, │
│ 251 │ │ │ │ │ │ │ scales, qzeros, g_idx, │
│ 252 │ │ │ │ │ │ │ input.shape[0], qweight.shape[1], input.shape[1], bits, maxq │
│ 253 │ │ │ │ │ │ │ input.stride(0), input.stride(1), │
│ │
│ /data/MOSS/models/custom_autotune.py:89 in run │
│ │
│ 86 │ │ │ │ # prune configs │
│ 87 │ │ │ │ pruned_configs = self.prune_configs(kwargs) │
│ 88 │ │ │ │ bench_start = time.time() │
│ ❱ 89 │ │ │ │ timings = {config: self._bench(*args, config=config, **kwargs) │
│ 90 │ │ │ │ │ │ │ for config in pruned_configs} │
│ 91 │ │ │ │ bench_end = time.time() │
│ 92 │ │ │ │ self.bench_time = bench_end - bench_start │
│ │
│ /data/MOSS/models/custom_autotune.py:89 in │
│ │
│ 86 │ │ │ │ # prune configs │
│ 87 │ │ │ │ pruned_configs = self.prune_configs(kwargs) │
│ 88 │ │ │ │ bench_start = time.time() │
│ ❱ 89 │ │ │ │ timings = {config: self._bench(*args, config=config, **kwargs) │
│ 90 │ │ │ │ │ │ │ for config in pruned_configs} │
│ 91 │ │ │ │ bench_end = time.time() │
│ 92 │ │ │ │ self.bench_time = bench_end - bench_start │
│ │
│ /data/MOSS/models/custom_autotune.py:71 in _bench │
│ │
│ 68 │ │ try: │
│ 69 │ │ │ # In testings using only 40 reps seems to be close enough and it appears to │
│ 70 │ │ │ # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll │
│ ❱ 71 │ │ │ return triton.testing.do_bench(kernel_call, rep=40) │
│ 72 │ │ except triton.compiler.OutOfResources: │
│ 73 │ │ │ return float('inf') │
│ 74 │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/triton/testing.py:143 in do_bench │
│ │
│ 140 │ """ │
│ 141 │ │
│ 142 │ # Estimate the runtime of the function │
│ ❱ 143 │ fn() │
│ 144 │ torch.cuda.synchronize() │
│ 145 │ start_event = torch.cuda.Event(enable_timing=True) │
│ 146 │ end_event = torch.cuda.Event(enable_timing=True) │
│ │
│ /data/MOSS/models/custom_autotune.py:67 in kernel_call │
│ │
│ 64 │ │ │ if config.pre_hook: │
│ 65 │ │ │ │ config.pre_hook(self.nargs) │
│ 66 │ │ │ self.hook(args) │
│ ❱ 67 │ │ │ self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, │
│ 68 │ │ try: │
│ 69 │ │ │ # In testings using only 40 reps seems to be close enough and it appears to │
│ 70 │ │ │ # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll │
│ in matmul_248_kernel:41 │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/triton/compiler.py:1588 in compile │
│ │
│ 1585 │ │ first_stage = list(stages.keys()).index(ir) │
│ 1586 │ │
│ 1587 │ # cache manager │
│ ❱ 1588 │ so_path = make_stub(name, signature, constants) │
│ 1589 │ # create cache manager │
│ 1590 │ fn_cache_manager = CacheManager(make_hash(fn, **kwargs)) │
│ 1591 │ # determine name and extension type of provided function │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/triton/compiler.py:1477 in make_stub │
│ │
│ 1474 │ │ │ src_path = os.path.join(tmpdir, "main.c") │
│ 1475 │ │ │ with open(src_path, "w") as f: │
│ 1476 │ │ │ │ f.write(src) │
│ ❱ 1477 │ │ │ so = _build(name, src_path, tmpdir) │
│ 1478 │ │ │ with open(so, "rb") as f: │
│ 1479 │ │ │ │ so_cache_manager.put(f.read(), so_name, binary=True) │
│ 1480 │ return so_cache_manager._make_path(so_name) │
│ │
│ /opt/miniconda3/lib/python3.10/site-packages/triton/compiler.py:1392 in _build │
│ │
│ 1389 │ │
│ 1390 │ cc_cmd = [cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir} │
│ 1391 │ cc_cmd += [f"-L{dir}" for dir in cuda_lib_dirs] │
│ ❱ 1392 │ ret = subprocess.check_call(cc_cmd) │
│ 1393 │ │
│ 1394 │ if ret == 0: │
│ 1395 │ │ return so │
│ │
│ /opt/miniconda3/lib/python3.10/subprocess.py:369 in check_call │
│ │
│ 366 │ │ cmd = kwargs.get("args") │
│ 367 │ │ if cmd is None: │
│ 368 │ │ │ cmd = popenargs[0] │
│ ❱ 369 │ │ raise CalledProcessError(retcode, cmd) │
│ 370 │ return 0 │
│ 371 │
│ 372 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmpneqhk1aw/main.c', '-O3', '-I/usr/local/cuda/include', '-I/opt/miniconda3/include/python3.10', '-I/tmp/tmpneqhk1aw', '-shared', '-fPIC', '-lcuda',
'-o', '/tmp/tmpneqhk1aw/matmul_248_kernel.cpython-310-x86_64-linux-gnu.so']' returned non-zero exit status 1.
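
How to read the chained traceback: the KeyError at the top is a secondary symptom. Triton first looks the kernel up in its compile cache, misses, and compiles it inside the except handler; when the gcc link step fails, Python reports the CalledProcessError as arising during handling of the KeyError. A hedged illustration of that pattern (a simplification for exposition, not Triton's actual source):

    import subprocess

    cache = {}

    def compile_kernel(key):
        # stand-in for Triton's make_stub/_build step, which shells out to gcc
        raise subprocess.CalledProcessError(1, ["/usr/bin/gcc", "-lcuda"])

    def launch(key):
        try:
            return cache[key]                 # first call misses -> KeyError
        except KeyError:
            cache[key] = compile_kernel(key)  # the recovery itself fails, chaining
            return cache[key]                 # CalledProcessError onto the KeyError

Calling launch(...) with any key reproduces the two-stage traceback shape shown above.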
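
As for the root cause: the gcc command above passes -lcuda but no -L directory that actually contains libcuda.so. A workaround sketch, assuming the CUDA toolkit's stub library lives at /usr/local/cuda/lib64/stubs (verify the path on your system; linking against the stub still requires the real NVIDIA driver at run time):

    # Hypothetical launcher: put the stub directory on gcc's link-time search
    # path (LIBRARY_PATH) before starting the demo. A symlink to the driver's
    # real libcuda.so.1 is an alternative that avoids the stub entirely.
    import os
    import subprocess

    stubs = "/usr/local/cuda/lib64/stubs"  # assumption: standard toolkit layout
    env = dict(os.environ)
    env["LIBRARY_PATH"] = stubs + ":" + env.get("LIBRARY_PATH", "")
    subprocess.run(["python", "moss_cli_demo.py"], env=env, check=True)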