NCCL error during vLLM weight loading when training Qwen/Qwen3-30B-A3B-Base with GRPO in a multi-node setup
- **Model:** Qwen/Qwen3-30B-A3B-Base
- **Training framework:** verl (commit 856f902)
- **Inference engine:** vLLM==0.8.5+cu128
- **Hardware setup:** 10 GH200 GPUs (1 GPU per node, multi-node training)
- **Algorithm:** GRPO
Using the same configuration and multi-node setup, I was able to train the Qwen/Qwen3-4B-Base model without any issues; the problem only occurs when switching to Qwen/Qwen3-30B-A3B-Base. How can I fix this?
**Configuration summary:** Here is the relevant part of the training configuration I'm using:
```
{
'actor_rollout_ref': {
'actor': {
'checkpoint': {'contents': ['model', 'optimizer', 'extra']},
'clip_ratio': 0.2,
'clip_ratio_c': 3.0,
'clip_ratio_high': 0.2,
'clip_ratio_low': 0.2,
'entropy_coeff': 0,
'fsdp_config': {
'fsdp_size': -1,
'grad_offload': True,
'optimizer_offload': True,
'param_offload': True,
'wrap_policy': {'min_num_params': 0}
},
'grad_clip': 1.0,
'kl_loss_coef': 0.001,
'kl_loss_type': 'low_var_kl',
'loss_agg_mode': 'token-mean',
'optim': {
'lr': 1e-06,
'lr_warmup_steps': -1,
'lr_warmup_steps_ratio': 0.0,
'min_lr_ratio': None,
'total_training_steps': -1,
'warmup_style': 'constant',
'weight_decay': 0.01
},
'ppo_epochs': 1,
'ppo_max_token_len_per_gpu': 16384,
'ppo_micro_batch_size': None,
'ppo_micro_batch_size_per_gpu': 1,
'ppo_mini_batch_size': 10,
'shuffle': False,
'strategy': 'fsdp',
'ulysses_sequence_parallel_size': 1,
'use_dynamic_bsz': False,
'use_kl_loss': True,
'use_torch_compile': True
},
'hybrid_engine': True,
'model': {
'enable_gradient_checkpointing': True,
'external_lib': None,
'override_config': {},
'path': 'Qwen/Qwen3-30B-A3B-Base',
'use_liger': False,
'use_remove_padding': True
},
'ref': {
'fsdp_config': {
'param_offload': True,
'wrap_policy': {'min_num_params': 0}
},
'log_prob_max_token_len_per_gpu': 16384,
'log_prob_micro_batch_size': None,
'log_prob_micro_batch_size_per_gpu': 2,
'log_prob_use_dynamic_bsz': False,
'strategy': 'fsdp',
'ulysses_sequence_parallel_size': 1,
'use_torch_compile': True
},
'rollout': {
'chat_scheduler': None,
'disable_log_stats': True,
'do_sample': True,
'dtype': 'bfloat16',
'enable_chunked_prefill': True,
'enforce_eager': True,
'engine_kwargs': {'swap_space': None},
'free_cache_engine': True,
'gpu_memory_utilization': 0.7,
'ignore_eos': False,
'load_format': 'dummy_dtensor',
'log_prob_max_token_len_per_gpu': 16384,
'log_prob_micro_batch_size': None,
'log_prob_micro_batch_size_per_gpu': 2,
'log_prob_use_dynamic_bsz': False,
'max_model_len': None,
'max_num_batched_tokens': 8192,
'max_num_seqs': 1024,
'mode': 'sync',
'multi_turn': {
'enable': False,
'format': 'chatml',
'max_turns': None,
'tool_config_path': None
},
'n': 5,
'name': 'vllm',
'prompt_length': 2048,
'response_length': 4096,
'temperature': 1.0,
'tensor_model_parallel_size': 2,
'top_k': -1,
'top_p': 1,
'use_fire_sampling': False,
'val_kwargs': {
'do_sample': False,
'n': 1,
'temperature': 0,
'top_k': -1,
'top_p': 1.0
}
}
},
'algorithm': {
'adv_estimator': 'grpo',
'gamma': 1.0,
'kl_ctrl': {
'horizon': 10000,
'kl_coef': 0.001,
'target_kl': 0.1,
'type': 'fixed'
},
'kl_penalty': 'kl',
'lam': 1.0,
'norm_adv_by_std_in_grpo': True,
'use_kl_in_reward': False
},
'base_model': 'Qwen/Qwen3-30B-A3B-Base',
'critic': {
'checkpoint': {'contents': ['model', 'optimizer', 'extra']},
'cliprange_value': 0.5,
'forward_max_token_len_per_gpu': 32768,
'forward_micro_batch_size': None,
'forward_micro_batch_size_per_gpu': 1,
'grad_clip': 1.0,
'model': {
'enable_gradient_checkpointing': True,
'external_lib': None,
'fsdp_config': {
'fsdp_size': -1,
'optimizer_offload': False,
'param_offload': False,
'wrap_policy': {'min_num_params': 0}
},
'override_config': {},
'path': 'Qwen/Qwen3-30B-A3B-Base',
'tokenizer_path': 'Qwen/Qwen3-30B-A3B-Base',
'use_remove_padding': False
},
'optim': {
'lr': 1e-05,
'lr_warmup_steps_ratio': 0.0,
'min_lr_ratio': None,
'total_training_steps': -1,
'warmup_style': 'constant',
'weight_decay': 0.01
},
'ppo_epochs': 1,
'ppo_max_token_len_per_gpu': 32768,
'ppo_micro_batch_size': None,
'ppo_micro_batch_size_per_gpu': 1,
'ppo_mini_batch_size': 10,
'rollout_n': 5,
'shuffle': False,
'strategy': 'fsdp',
'ulysses_sequence_parallel_size': 1,
'use_dynamic_bsz': False
},
'custom_reward_function': {'name': 'compute_score', 'path': None},
'data': {
'custom_cls': {'name': None, 'path': None},
'filter_overlong_prompts': False,
'filter_overlong_prompts_workers': 1,
'image_key': 'images',
'max_prompt_length': 2048,
'max_response_length': 4096,
'prompt_key': 'prompt',
'return_raw_chat': False,
'return_raw_input_ids': False,
'reward_fn_key': 'data_source',
'shuffle': True,
'tokenizer': None,
'train_batch_size': 10,
'train_files': '/admin/data/v6/train_new_v6_withtype.parquet',
'truncation': 'error',
'val_batch_size': None,
'val_files': '/admin/data/v6/test_new_v6_withtype.parquet',
'video_key': 'videos'
},
'ngpus': 4,
'ray_init': {'num_cpus': None},
'reward_model': {
'enable': False,
'forward_max_token_len_per_gpu': 32768,
'launch_reward_fn_async': False,
'max_length': None,
'micro_batch_size': None,
'micro_batch_size_per_gpu': None,
'model': {
'external_lib': None,
'fsdp_config': {
'fsdp_size': -1,
'param_offload': False,
'wrap_policy': {'min_num_params': 0}
},
'input_tokenizer': 'Qwen/Qwen3-30B-A3B-Base',
'path': '~/models/FsfairX-LLaMA3-RM-v0.1',
'use_remove_padding': False
},
'reward_manager': 'naive',
'strategy': 'fsdp',
'ulysses_sequence_parallel_size': 1,
'use_dynamic_bsz': False
},
'rollout_tp_size': 2,
'trainer': {
'balance_batch': True,
'critic_warmup': 0,
'default_hdfs_dir': None,
'default_local_dir': 'checkpoints/verl_examples/gsm8k',
'del_local_ckpt_after_load': False,
'experiment_name': 'gsm8k',
'log_val_generations': 0,
'logger': ['console', 'wandb'],
'max_actor_ckpt_to_keep': None,
'max_critic_ckpt_to_keep': None,
'n_gpus_per_node': 1,
'nnodes': 10,
'project_name': 'verl_examples',
'ray_wait_register_center_timeout': 300,
'resume_from_path': None,
'resume_mode': 'auto',
'rollout_data_dir': None,
'save_freq': 100,
'test_freq': 100,
'total_epochs': 15,
'total_training_steps': None,
'val_before_train': True,
'validation_data_dir': None
}
}
```
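One observation from this config (not necessarily the root cause): with `trainer.nnodes=10`, `trainer.n_gpus_per_node=1`, and `rollout.tensor_model_parallel_size=2`, every vLLM tensor-parallel group has to span two different nodes, and the FSDP mesh spans all ten nodes, so both rollout and the FSDP-to-vLLM weight sync rely on cross-node NCCL. A minimal sketch of the implied layout, assuming the usual contiguous rank-to-TP-group assignment (illustrative only, not verl code):

```python
# Illustrative only: with one GPU per node and TP=2, each tensor-parallel
# group of ranks necessarily lands on two distinct nodes, assuming contiguous
# rank -> TP-group assignment.
nnodes, gpus_per_node, tp_size = 10, 1, 2
world_size = nnodes * gpus_per_node  # 10 ranks total

for group_id in range(world_size // tp_size):
    ranks = list(range(group_id * tp_size, (group_id + 1) * tp_size))
    nodes = sorted({rank // gpus_per_node for rank in ranks})
    print(f"TP group {group_id}: ranks {ranks} on nodes {nodes}")
# Every group lists two distinct nodes, i.e., all TP traffic is inter-node.
```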
**Log:**
```
(WorkerDict pid=2040875, ip=10.141.1.5) wrap_policy: functools.partial(<function _or_policy at 0xe705d22959e0>, policies=[functools.partial(<function transformer_auto_wrap_policy at 0xe705d22958a0>, transformer_layer_cls={<class 'transformers.models.qwen3_moe.modeling_qwen3_moe.Qwen3MoeDecoderLayer'>})]) [repeated 9x across cluster]
(WorkerDict pid=2187936, ip=10.141.1.2) Actor use_remove_padding=True
(WorkerDict pid=2187936, ip=10.141.1.2) Total steps: 94470, num_warmup_steps: 0 [repeated 8x across cluster]
(WorkerDict pid=2040875, ip=10.141.1.5) WARNING 05-01 05:49:51 [cuda.py:96] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used
(WorkerDict pid=1330940) Actor use_remove_padding=True [repeated 9x across cluster]
(WorkerDict pid=1330940) Total steps: 94470, num_warmup_steps: 0
(WorkerDict pid=2040875, ip=10.141.1.5) WARNING 05-01 05:49:52 [utils.py:2494] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0xe70220d86f30>
(WorkerDict pid=2061147, ip=10.141.1.3) ERROR 05-01 05:49:53 [pynccl_wrapper.py:229] Failed to load NCCL library from libnccl.so.2. It is expected if you are not running on NVIDIA/AMD GPUs.Otherwise, the nccl library might not exist, be corrupted or it does not support the current platform Linux-6.8.0-1021-nvidia-64k-aarch64-with-glibc2.35. If you already have the library, please set the environment variable VLLM_NCCL_SO_PATH to point to the correct nccl library path.
(WorkerDict pid=2148632, ip=10.141.1.6) WARNING 05-01 05:49:56 [topk_topp_sampler.py:69] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
(WorkerDict pid=2148632, ip=10.141.1.6) WARNING 05-01 05:49:58 [fused_moe.py:670] Using default MoE config. Performance might be sub-optimal! Config file not found at /admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GH200_480GB.json
(WorkerDict pid=1330940) WARNING 05-01 05:49:52 [cuda.py:96] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used [repeated 9x across cluster]
(WorkerDict pid=1330940) WARNING 05-01 05:49:53 [utils.py:2494] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0xe3690b53f1a0> [repeated 9x across cluster]
(WorkerDict pid=2005309, ip=10.141.1.7) ERROR 05-01 05:49:53 [pynccl_wrapper.py:229] Failed to load NCCL library from libnccl.so.2. It is expected if you are not running on NVIDIA/AMD GPUs.Otherwise, the nccl library might not exist, be corrupted or it does not support the current platform Linux-6.8.0-1021-nvidia-64k-aarch64-with-glibc2.35. If you already have the library, please set the environment variable VLLM_NCCL_SO_PATH to point to the correct nccl library path. [repeated 9x across cluster]
(WorkerDict pid=2061147, ip=10.141.1.3) /admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
(WorkerDict pid=2061147, ip=10.141.1.3) warnings.warn(
(WorkerDict pid=2061147, ip=10.141.1.3) kwargs: {'n': 5, 'logprobs': 0, 'max_tokens': 4096, 'detokenize': False, 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'ignore_eos': False}
(WorkerDict pid=2061147, ip=10.141.1.3) WARNING 05-01 05:49:56 [topk_topp_sampler.py:69] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer. [repeated 9x across cluster]
(WorkerDict pid=1972546, ip=10.141.1.8) WARNING 05-01 05:49:58 [fused_moe.py:670] Using default MoE config. Performance might be sub-optimal! Config file not found at /admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GH200_480GB.json [repeated 9x across cluster]
(WorkerDict pid=2061147, ip=10.141.1.3) Only support config type of {'deepseek_v3', 'llama', 'qwen2_vl', 'qwen2_5_vl', 'qwen2'}, but got qwen3_moe. MFU will always be zero.
(WorkerDict pid=2040875, ip=10.141.1.5) Only support config type of {'llama', 'deepseek_v3', 'qwen2_5_vl', 'qwen2_vl', 'qwen2'}, but got qwen3_moe. MFU will always be zero.
(WorkerDict pid=2007944, ip=10.141.1.10) Only support config type of {'qwen2_5_vl', 'qwen2', 'deepseek_v3', 'qwen2_vl', 'llama'}, but got qwen3_moe. MFU will always be zero.
(TaskRunner pid=1330721) wandb: Currently logged in as: mprabhud (physicsreasoning) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
(TaskRunner pid=1330721) wandb: Tracking run with wandb version 0.19.10
(TaskRunner pid=1330721) wandb: Run data is saved locally in /tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/wandb/run-20250501_055006-wzyoh9ve
(TaskRunner pid=1330721) wandb: Run `wandb offline` to turn off syncing.
(TaskRunner pid=1330721) wandb: Syncing run gsm8k
(TaskRunner pid=1330721) wandb: ⭐️ View project at https://wandb.ai/physicsreasoning/verl_examples
(TaskRunner pid=1330721) wandb: 🚀 View run at https://wandb.ai/physicsreasoning/verl_examples/runs/wzyoh9ve
(TaskRunner pid=1330721) Using LocalLogger is deprecated. The constructor API will change
(TaskRunner pid=1330721) Checkpoint tracker file does not exist: %s /tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/checkpoints/verl_examples/gsm8k/latest_checkpointed_iteration.txt
(TaskRunner pid=1330721) Training from scratch
(TaskRunner pid=1330721) test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True}
(TaskRunner pid=1330721) wandb:
(TaskRunner pid=1330721) wandb: 🚀 View run gsm8k at: https://wandb.ai/physicsreasoning/verl_examples/runs/wzyoh9ve
(TaskRunner pid=1330721) wandb: ⭐️ View project at: https://wandb.ai/physicsreasoning/verl_examples
(TaskRunner pid=1330721) wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
(TaskRunner pid=1330721) wandb: Find logs at: ./wandb/run-20250501_055006-wzyoh9ve/logs
(WorkerDict pid=1972546, ip=10.141.1.8) /admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:680: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html . [repeated 9x across cluster]
(WorkerDict pid=1972546, ip=10.141.1.8) warnings.warn( [repeated 9x across cluster]
Error executing job with overrides: ['exps=[grpo,math_p_qwen3-30B_multinode]', 'trainer.nnodes=10', 'trainer.n_gpus_per_node=1']
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/trainer/main_ppo.py", line 181, in <module>
main()
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/hydra/main.py", line 94, in decorated_main
_run_hydra(
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
_run_app(
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/hydra/_internal/utils.py", line 457, in _run_app
run_and_report(
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
raise ex
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
return func()
^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/hydra/_internal/utils.py", line 458, in <lambda>
lambda: hydra.run(
^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
^^^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/trainer/main_ppo.py", line 64, in main
run_ppo(config)
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/trainer/main_ppo.py", line 79, in run_ppo
ray.get(runner.run.remote(config))
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/ray/_private/worker.py", line 2771, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/ray/_private/worker.py", line 919, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::TaskRunner.run() (pid=1330721, ip=10.141.1.1, actor_id=ee2afccef02a6781ca5d1f8803000000, repr=<main_ppo.TaskRunner object at 0xe1577329bb60>)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/trainer/main_ppo.py", line 177, in run
trainer.fit()
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/trainer/ppo/ray_trainer.py", line 888, in fit
val_metrics = self._validate()
^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/trainer/ppo/ray_trainer.py", line 621, in _validate
test_output_gen_batch_padded = self.actor_rollout_wg.generate_sequences(test_gen_batch_padded)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/single_controller/ray/base.py", line 47, in func
output = ray.get(output)
^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ray.exceptions.RayTaskError(RuntimeError): ray::WorkerDict.actor_rollout_generate_sequences() (pid=1990537, ip=10.141.1.9, actor_id=39bdc07ef14b2ea3f9119afb03000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0xfa78af500a10>)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/single_controller/ray/base.py", line 451, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/single_controller/base/decorator.py", line 420, in inner
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/workers/fsdp_workers.py", line 566, in generate_sequences
with self.rollout_sharding_manager:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/utils/debug/performance.py", line 78, in f
return self.log(decorated_function, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/utils/debug/performance.py", line 88, in log
output = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/workers/sharding_manager/fsdp_vllm.py", line 116, in __enter__
self.update_params(params)
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/workers/sharding_manager/fsdp_vllm.py", line 184, in update_params
loaded_params = model.load_weights(((name, param.full_tensor() if world_size != 1 and hasattr(param, "full_tensor") else param) for name, param in updated_params.items()))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/vllm/model_executor/models/qwen3_moe.py", line 538, in load_weights
return loader.load_weights(weights)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/vllm/model_executor/models/utils.py", line 261, in load_weights
autoloaded_weights = set(self._load_module("", self.module, weights))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/vllm/model_executor/models/utils.py", line 222, in _load_module
yield from self._load_module(prefix,
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/vllm/model_executor/models/utils.py", line 195, in _load_module
loaded_params = module_load_weights(weights)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/vllm/model_executor/models/qwen3_moe.py", line 400, in load_weights
for name, loaded_weight in weights:
^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/vllm/model_executor/models/utils.py", line 110, in <genexpr>
for parts, weights_data in group),
^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/vllm/model_executor/models/utils.py", line 101, in <genexpr>
for weight_name, weight_data in weights)
^^^^^^^
File "/tmp/ray/session_2025-05-01_05-36-53_580720_1324271/runtime_resources/working_dir_files/_ray_pkg_309703cefced7999/verl/workers/sharding_manager/fsdp_vllm.py", line 184, in <genexpr>
loaded_params = model.load_weights(((name, param.full_tensor() if world_size != 1 and hasattr(param, "full_tensor") else param) for name, param in updated_params.items()))
^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 570, in full_tensor
redist_res = self.redistribute(
^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 542, in redistribute
return Redistribute.apply(self, device_mesh, placements, async_op)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/torch/autograd/function.py", line 575, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/torch/distributed/tensor/_redistribute.py", line 306, in forward
output = redistribute_local_tensor(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/torch/distributed/tensor/_redistribute.py", line 213, in redistribute_local_tensor
new_local_tensor = current_placement._to_replicate_tensor(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/torch/distributed/tensor/placement_types.py", line 260, in _to_replicate_tensor
result = funcol.all_gather_tensor(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/torch/distributed/_functional_collectives.py", line 205, in all_gather_tensor
tensor = torch.ops._c10d_functional.all_gather_into_tensor(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/admin/micromamba/envs/vllm_owen/lib/python3.12/site-packages/torch/_ops.py", line 1158, in __call__
return self._op(*args, **(kwargs or {}))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: NCCL Error 2: unhandled system error (run with NCCL_DEBUG=INFO for details)
(WorkerDict pid=1972546, ip=10.141.1.8) kwargs: {'n': 5, 'logprobs': 0, 'max_tokens': 4096, 'detokenize': False, 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'ignore_eos': False} [repeated 9x across cluster]
(WorkerDict pid=2005309, ip=10.141.1.7) Only support config type of {'qwen2', 'qwen2_5_vl', 'llama', 'qwen2_vl', 'deepseek_v3'}, but got qwen3_moe. MFU will always be zero. [repeated 4x across cluster]
(WorkerDict pid=1972546, ip=10.141.1.8) Only support config type of {'llama', 'deepseek_v3', 'qwen2_vl', 'qwen2', 'qwen2_5_vl'}, but got qwen3_moe. MFU will always be zero.
(WorkerDict pid=1990537, ip=10.141.1.9) Only support config type of {'qwen2', 'deepseek_v3', 'qwen2_vl', 'qwen2_5_vl', 'llama'}, but got qwen3_moe. MFU will always be zero. [repeated 2x across cluster]
```
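From the traceback, the failure is raised while `update_params` calls `param.full_tensor()`, which all-gathers each FSDP-sharded DTensor across the 10-rank mesh before feeding it to vLLM's `load_weights`; the earlier `pynccl_wrapper` errors about `libnccl.so.2` also hint that NCCL itself may be unhappy on this aarch64 GH200 environment. To separate a cluster-level NCCL problem from the verl/vLLM weight-sync path, a standalone cross-node all-gather can be run with `NCCL_DEBUG=INFO`. This is only a diagnostic sketch under assumptions (torchrun available on every node, placeholder rendezvous endpoint), not part of verl:

```python
# nccl_check.py: hypothetical standalone script, not part of verl.
# Runs one large all-gather across all ranks, the same kind of collective that
# DTensor.full_tensor() issues during verl's FSDP -> vLLM weight sync.
# Example launch on each node (placeholder endpoint):
#   NCCL_DEBUG=INFO torchrun --nnodes=10 --nproc_per_node=1 \
#       --rdzv_backend=c10d --rdzv_endpoint=<head-node-ip>:29500 nccl_check.py
import os

import torch
import torch.distributed as dist


def main() -> None:
    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    torch.cuda.set_device(int(os.environ.get("LOCAL_RANK", 0)))

    # One shard per rank; the gathered result is roughly the size of a large weight tensor.
    shard = torch.randn(64 * 1024 * 1024 // world_size, device="cuda")
    gathered = [torch.empty_like(shard) for _ in range(world_size)]
    dist.all_gather(gathered, shard)
    torch.cuda.synchronize()
    print(f"rank {rank}/{world_size}: cross-node all_gather OK "
          f"({shard.numel()} elements per shard)")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

If this minimal collective reproduces `NCCL Error 2: unhandled system error`, the issue is in the cluster's NCCL/network setup rather than anything specific to Qwen3-MoE; if it passes, the problem is more likely confined to the MoE weight-sync path.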
Loading weights failed. Did you check CPU/GPU memory usage?
Yes, I did not observe any CPU/GPU out-of-memory issues. Also, I can successfully train Qwen2.5-32B using a similar config with a larger micro-batch size, so GPU memory is probably not the issue.
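For reference, a minimal way to spot-check memory around the failing step (a hypothetical helper, not verl code; assumes `psutil` is installed):

```python
# Hypothetical helper, not part of verl: print GPU and host memory so an OOM
# can be ruled in or out around the rollout weight sync.
import psutil
import torch


def log_memory(tag: str) -> None:
    gib = 1024 ** 3
    print(
        f"[{tag}] cuda allocated={torch.cuda.memory_allocated() / gib:.1f} GiB, "
        f"cuda reserved={torch.cuda.memory_reserved() / gib:.1f} GiB, "
        f"host used={psutil.virtual_memory().used / gib:.1f} GiB"
    )

# e.g. call log_memory("before generate_sequences") and log_memory("after") around the failing step
```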
I met the same error.
> Loading weights failed. Did you check CPU/GPU memory usage?
>
> Yes, I did not observe any CPU/GPU out-of-memory issues. Also, I can successfully train Qwen2.5-32B using a similar config with a larger micro-batch size, so GPU memory is probably not the issue.

Can you share your training script for multi-node GRPO training of Qwen2.5-32B? @QinOwen
Was this solved? I'm hitting the same problem.
Same issue here. @eric-haibin-lin