ValueError: No dot product attention backend is available for the provided inputs.
System Info
I am training a (pruned) Moonlight model, which is based on DeepSeek-V3, on a single node with 6 A800 GPUs, using both the DeepSeek-V3-based and the latest Docker environments. I keep hitting `ValueError: No dot product attention backend is available for the provided inputs.` I searched for a long time; many people have run into this, but there is no solution. Is this an error that only occurs on A-series (Ampere) cards? Someone reported it back in May as well and a patch was applied, but I still hit the same problem.
Information
- [ ] The official example scripts
- [x] My own modified scripts
Tasks
- [x] An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- [ ] My own task or dataset (give details below)
Reproduction
```
Training Progress:   0%|          | 0/18675 [00:19<?, ?it/s]
Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::WorkerDict.actor_rollout_compute_log_prob() (pid=74726, ip=192.168.200.218, actor_id=84a7ad04e43adb9433f4faac01000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7f58a5ad0490>)
  File "/workspace/verl/verl-main-new/verl-main/verl/single_controller/ray/base.py", line 701, in func
    return getattr(self.worker_dict[key], name)(*args, **kwargs)
  File "/workspace/verl/verl-main-new/verl-main/verl/single_controller/base/decorator.py", line 430, in inner
    return func(*args, **kwargs)
  File "/workspace/verl/verl-main-new/verl-main/verl/utils/profiler/performance.py", line 105, in f
    return self.log(decorated_function, *args, **kwargs)
  File "/workspace/verl/verl-main-new/verl-main/verl/utils/profiler/performance.py", line 118, in log
    output = func(*args, **kwargs)
  File "/workspace/verl/verl-main-new/verl-main/verl/utils/profiler/profile.py", line 256, in wrapper
    return func(self_instance, *args, **kwargs_inner)
  File "/workspace/verl/verl-main-new/verl-main/verl/workers/megatron_workers.py", line 736, in compute_log_prob
    output, entropys = self.actor.compute_log_prob(data=data, calculate_entropy=True)
  File "/workspace/verl/verl-main-new/verl-main/verl/utils/profiler/performance.py", line 105, in f
    return self.log(decorated_function, *args, **kwargs)
  File "/workspace/verl/verl-main-new/verl-main/verl/utils/profiler/performance.py", line 118, in log
    output = func(*args, **kwargs)
  File "/workspace/verl/verl-main-new/verl-main/verl/workers/actor/megatron_actor.py", line 193, in compute_log_prob
    output = self.forward_backward_batch(
  File "/workspace/verl/verl-main-new/verl-main/verl/workers/actor/megatron_actor.py", line 534, in forward_backward_batch
    losses_reduced = forward_backward_func(
  File "/usr/local/lib/python3.10/dist-packages/megatron/core/pipeline_parallel/schedules.py", line 1889, in forward_backward_pipelining_without_interleaving
    output_tensor, num_tokens = forward_step(
  File "/usr/local/lib/python3.10/dist-packages/megatron/core/pipeline_parallel/schedules.py", line 289, in forward_step
    output_tensor, loss_func = forward_step_func(data_iterator, model)
  File "/workspace/verl/verl-main-new/verl-main/verl/workers/actor/megatron_actor.py", line 516, in forward_step
    output = forward_fn(
  File "/workspace/verl/verl-main-new/verl-main/verl/models/mcore/model_forward.py", line 41, in gptmodel_forward
    output_orig = model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/megatron/core/distributed/data_parallel_base.py", line 22, in forward
    return self.module(*inputs, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/megatron/core/transformer/module.py", line 237, in forward
    outputs = self.module(*inputs, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/megatron/core/models/gpt/gpt_model.py", line 372, in forward
    hidden_states = self.decoder(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/megatron/core/transformer/transformer_block.py", line 581, in forward
    hidden_states, context = layer(
  File "/usr/local/lib/python3.10/dist-packages/megatron/core/transformer/transformer_layer.py", line 875, in __call__
    return super(MegatronModule, self).__call__(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/megatron/core/transformer/transformer_layer.py", line 441, in forward
    hidden_states, context = self._forward_attention(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/megatron/core/transformer/transformer_layer.py", line 501, in _forward_attention
    attention_output_with_bias = self.self_attention(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/megatron/core/transformer/multi_latent_attention.py", line 213, in forward
    core_attn_out = self.core_attention(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/megatron/core/extensions/transformer_engine.py", line 931, in forward
    core_attn_out = super().forward(
  File "/usr/local/lib/python3.10/dist-packages/transformer_engine/pytorch/attention.py", line 6226, in forward
    raise ValueError(
ValueError: No dot product attention backend is available for the provided inputs. Please run with NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 to find out the reasons for disabling all backends.

(WorkerDict pid=74726) DEBUG:2025-09-05 03:44:10,426:Running with config={'transformer_engine_version': '2.2.0+d0c452c', 'compute_capability': 'sm80', 'flash_attn_version': '2.7.4.post1', 'flash_attn_3_version': 'not installed', 'cudnn_version': '9.8.0', 'qkv_type': <class 'torch.Tensor'>, 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'thd_thd_thd', 'batch_size': 1, 'num_heads': 8, 'num_gqa_groups': 8, 'max_seqlen_q': 576, 'max_seqlen_kv': 576, 'head_dim_qk': 192, 'head_dim_v': 128, 'attn_mask_type': 'padding_causal', 'window_size': (-1, 0), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None}, 'inference_params': None}
(WorkerDict pid=74726) DEBUG:2025-09-05 03:44:10,426:Disabling FlashAttention 2 due to NVTE_FLASH_ATTN=0
(WorkerDict pid=74726) DEBUG:2025-09-05 03:44:10,426:Disabling UnfusedDotProductAttention due to NVTE_UNFUSED_ATTN=0
(WorkerDict pid=74726) DEBUG:2025-09-05 03:44:10,426:Disabling FusedAttention as no backend supports the provided input
(WorkerDict pid=74726) DEBUG:2025-09-05 03:44:10,426:Available backends = {FlashAttention=False, FusedAttention=False, UnfusedDotProductAttention=False}
(WorkerDict pid=74726) DEBUG:2025-09-05 03:44:10,426:Selected backend = NoBackend
```
Expected behavior
I hope this bug can be fixed soon.
To add more context, here is the script I use. This error only appears with the MoE model. With the dense model, FlashAttention is automatically selected as the backend because both the q/k head dim and the v head dim are 128; for the MoE model the q/k head dim is 192 and the v head dim is 128, so FlashAttention is disabled and FusedAttention is disabled, and UnfusedDotProductAttention is also disabled because variable-length (THD) sequences are used. I still don't know what to change (see the sketch after the script below):
```bash
set -x

export CUDA_DEVICE_MAX_CONNECTIONS=1  # For Megatron communication/computation overlapping
#export VLLM_FLASH_ATTN_VERSION=3
export HYDRA_FULL_ERROR=1
export NVTE_DEBUG=1
export NVTE_DEBUG_LEVEL=2

export SWANLAB_API_KEY=dMg0cJLqvfJ1TtqGd1vro          # API key for online tracking
export SWANLAB_LOG_DIR=/workspace/verl/verl-main/log  # local log directory
export SWANLAB_MODE=local  # one of: cloud (default, online tracking), cloud-only (online only, no local files), local (local tracking only), disabled (record nothing, for debugging)

# Clear all related environment variables
export NVTE_FLASH_ATTN=1
unset NVTE_FUSED_ATTN
unset NVTE_UNFUSED_ATTN
HF_MODEL_PATH=/workspace/resource/Moonlight-16B-A3B-Instruct_16
DIST_CKPT_PATH=/workspace/resource/moonlight_megatron_format_test_16

train_path=/workspace/resource/ouyang_212/data2/ouyang/verl/data/gsm8k/train.parquet
test_path=/workspace/resource/ouyang_212/data2/ouyang/verl/data/gsm8k/test.parquet

#python converter_hf_to_mcore.py --hf_model_path /workspace/resource/Moonlight-16B-A3B-Instruct --output_path /workspace/resource/moonlight_megatron_format_test
#torchrun --nproc_per_node 4 --nnodes 1 scripts/converter_hf_to_mcore.py --hf_model_path /workspace/resource/Moonlight-16B-A3B-Instruct --output_path /workspace/resource/moonlight_megatron_format_test --trust_remote_code
CUDA_VISIBLE_DEVICES=2,3,4,5 python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    algorithm.adv_estimator=grpo \
    data.train_files="$train_path" \
    data.val_files="$test_path" \
    data.train_batch_size=4 \
    data.max_prompt_length=256 \
    data.max_response_length=256 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.trust_remote_code=True \
    actor_rollout_ref.model.path=$HF_MODEL_PATH \
    actor_rollout_ref.model.trust_remote_code=True \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=1 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=1 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
    actor_rollout_ref.actor.megatron.expert_model_parallel_size=1 \
    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=1 \
    actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \
    actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
    +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.38 \
    actor_rollout_ref.rollout.n=1 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=1 \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=4 \
    actor_rollout_ref.ref.megatron.expert_model_parallel_size=1 \
    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=1 \
    actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \
    actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console"]' \
    trainer.project_name='verl_grpo_example_gsm8k_math' \
    trainer.experiment_name='moonlight_megatron_ep' \
    trainer.n_gpus_per_node=4 \
    trainer.nnodes=1 \
    trainer.save_freq=20 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 $@
```
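One thing I am unsure about (an assumption, not a confirmed fix): the debug log shows `NVTE_FLASH_ATTN=0` and `NVTE_UNFUSED_ATTN=0` inside the worker even though the script exports different values, so the switches may be getting set or cleared again somewhere between the shell and the Transformer Engine call. Below is a minimal sketch of setting them programmatically, as early as possible in each worker process; the variable names are the ones from the debug output, and where exactly to place this in verl is an assumption:

```python
# Hypothetical sketch (not a verl API): force the Transformer Engine attention switches
# from Python, as early as possible in each worker process.
import os

os.environ["NVTE_DEBUG"] = "1"         # print backend-selection decisions
os.environ["NVTE_DEBUG_LEVEL"] = "2"
os.environ["NVTE_FUSED_ATTN"] = "1"    # allow the cuDNN fused backend
os.environ["NVTE_UNFUSED_ATTN"] = "1"  # keep the unfused fallback enabled
```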
same error
For DeepSeek-style models, you should use the FusedAttention backend instead of FlashAttention.
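For example, one way to make that choice reach every Ray worker rather than only the driver shell is to pass the switches through Ray's `runtime_env`. This is only a sketch, not a confirmed verl workflow; whether verl's own Ray initialization can be adjusted like this is an assumption:

```python
# Hypothetical sketch: propagate the NVTE_* switches to all Ray workers through Ray's
# runtime_env instead of relying on exports in the driver shell.
import ray

ray.init(
    runtime_env={
        "env_vars": {
            "NVTE_FLASH_ATTN": "0",  # skip FlashAttention, as suggested above
            "NVTE_FUSED_ATTN": "1",  # prefer the cuDNN fused backend
            "NVTE_DEBUG": "1",
            "NVTE_DEBUG_LEVEL": "2",
        }
    }
)
```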
I'm running into this as well.