'''
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in <module>:14 │
│ │
│ 11 │ │ metric(output.logits, labels, loss) │
│ 12 │ │ acc, train_loss = metric.get_metric() │
│ 13 │ │ │
│ ❱ 14 │ │ accelerator.backward(loss) │
│ 15 │ │ optimizer.step() │
│ 16 │ │ │
│ 17 │ │ if not accelerator.optimizer_step_was_skipped: │
│ │
│ /home/wangyan/miniconda3/envs/moss/lib/python3.8/site-packages/accelerate/accelerator.py:1634 in │
│ backward │
│ │
│ 1631 │ │ elif self.distributed_type == DistributedType.MEGATRON_LM: │
│ 1632 │ │ │ return │
│ 1633 │ │ elif self.scaler is not None: │
│ ❱ 1634 │ │ │ self.scaler.scale(loss).backward(**kwargs) │
│ 1635 │ │ else: │
│ 1636 │ │ │ loss.backward(**kwargs) │
│ 1637 │
│ │
│ /home/wangyan/miniconda3/envs/moss/lib/python3.8/site-packages/torch/_tensor.py:488 in backward │
│ │
│ 485 │ │ │ │ create_graph=create_graph, │
│ 486 │ │ │ │ inputs=inputs, │
│ 487 │ │ │ ) │
│ ❱ 488 │ │ torch.autograd.backward( │
│ 489 │ │ │ self, gradient, retain_graph, create_graph, inputs=inputs │
│ 490 │ │ ) │
│ 491 │
│ │
│ /home/wangyan/miniconda3/envs/moss/lib/python3.8/site-packages/torch/autograd/__init__.py:197 in │
│ backward │
│ │
│ 194 │ # The reason we repeat same the comment below is that │
│ 195 │ # some Python versions print out the first line of a multi-line function │
│ 196 │ # calls in the traceback and some print out the last line │
│ ❱ 197 │ Variable.execution_engine.run_backward( # Calls into the C++ engine to run the bac │
│ 198 │ │ tensors, grad_tensors, retain_graph, create_graph, inputs, │
│ 199 │ │ allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to ru │
│ 200 │
│ │
│ /home/wangyan/miniconda3/envs/moss/lib/python3.8/site-packages/torch/autograd/function.py:267 in │
│ apply │
│ │
│ 264 │ │ │ │ │ │ │ "Function is not allowed. You should only implement one " │
│ 265 │ │ │ │ │ │ │ "of them.") │
│ 266 │ │ user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn │
│ ❱ 267 │ │ return user_fn(self, *args) │
│ 268 │ │
│ 269 │ def apply_jvp(self, *args): │
│ 270 │ │ # _forward_cls is defined by derived class │
│ │
│ /home/wangyan/miniconda3/envs/moss/lib/python3.8/site-packages/torch/utils/checkpoint.py:157 in │
│ backward │
│ │
│ 154 │ │ │ raise RuntimeError( │
│ 155 │ │ │ │ "none of output has requires_grad=True," │
│ 156 │ │ │ │ " this checkpoint() is not necessary") │
│ ❱ 157 │ │ torch.autograd.backward(outputs_with_grad, args_with_grad) │
│ 158 │ │ grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else None │
│ 159 │ │ │ │ │ for inp in detached_inputs) │
│ 160 │
│ │
│ /home/wangyan/miniconda3/envs/moss/lib/python3.8/site-packages/torch/autograd/__init__.py:197 in │
│ backward │
│ │
│ 194 │ # The reason we repeat same the comment below is that │
│ 195 │ # some Python versions print out the first line of a multi-line function │
│ 196 │ # calls in the traceback and some print out the last line │
│ ❱ 197 │ Variable.execution_engine.run_backward( # Calls into the C++ engine to run the bac │
│ 198 │ │ tensors, grad_tensors, retain_graph, create_graph, inputs, │
│ 199 │ │ allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to ru │
│ 200 │
│ │
│ /home/wangyan/miniconda3/envs/moss/lib/python3.8/site-packages/torch/autograd/function.py:267 in │
│ apply │
│ │
│ 264 │ │ │ │ │ │ │ "Function is not allowed. You should only implement one " │
│ 265 │ │ │ │ │ │ │ "of them.") │
│ 266 │ │ user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn │
│ ❱ 267 │ │ return user_fn(self, *args) │
│ 268 │ │
│ 269 │ def apply_jvp(self, *args): │
│ 270 │ │ # _forward_cls is defined by derived class │
│ │
│ /home/wangyan/miniconda3/envs/moss/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py:1 │
│ 22 in decorate_bwd │
│ │
│ 119 │ @functools.wraps(bwd) │
│ 120 │ def decorate_bwd(*args, **kwargs): │
│ 121 │ │ with autocast(args[0]._fwd_used_autocast): │
│ ❱ 122 │ │ │ return bwd(*args, **kwargs) │
│ 123 │ return decorate_bwd │
│ 124 │
│ │
│ /home/wangyan/.cache/huggingface/modules/transformers_modules/local/quantization.py:292 in │
│ backward │
│ │
│ 289 │ │ grad_input = None │
│ 290 │ │ │
│ 291 │ │ if ctx.needs_input_grad[0]: │
│ ❱ 292 │ │ │ grad_input = transpose_matmul248(grad_output, qweight, scales, qzeros, g_idx │
│ 293 │ │ return grad_input, None, None, None, None, None, None │
│ 294 │
│ 295 class QuantLinear(nn.Module): │
│ │
│ /home/wangyan/.cache/huggingface/modules/transformers_modules/local/quantization.py:265 in │
│ transpose_matmul248 │
│ │
│ 262 │ output = torch.empty((input.shape[0], output_dim), device='cuda', dtype=torch.float1 │
│ 263 │ grid = lambda META: ( │
│ 264 │ triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(output_dim, META['BL │
│ ❱ 265 │ transpose_matmul_248_kernel[grid](input, qweight, output, │
│ 266 │ │ │ │ │ │ │ │ │ scales, qzeros, g_idx, │
│ 267 │ │ │ │ │ │ │ │ │ input.shape[0], qweight.shape[1], output_dim, bits │
│ 268 │ │ │ │ │ │ │ │ │ input.stride(0), input.stride(1), │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
NameError: name 'transpose_matmul_248_kernel' is not defined
'''
尝试对模型进行微调时出现上述错误。检查对应位置的文件 .cache/huggingface/modules/transformers_modules/local/quantization.py 后发现,该文件的上下文和依赖中都不存在 transpose_matmul_248_kernel,但上下文中存在 trans_matmul_248_kernel,推测是内核改名后调用处没有全部同步修改导致的。于是将 MOSS/models/quantization.py 中的 transpose_matmul_248_kernel 改为 trans_matmul_248_kernel,错误解决。
@xia-huang-411303 不行啊,.cache 里面的文件和 models 里面对应的 quantization.py 文件我都改了,但是重新跑脚本之后,.cache 里的那个文件又变回去了。