CUDA error: an illegal memory access was encountered when increasing the maximum prompt length (max_prompt_length)
I encountered an error while running GRPO training of the Qwen2.5-7B model on a 2-node setup (16 x H800-80GB GPUs) using the latest version of the framework (250209).
The error is an illegal memory access, which occurs when I attempt to increase the maximum prompt length (max_prompt_length).
Specifically, training runs without issues when data.max_prompt_length is set to less than 6500; however, it fails as soon as this value is increased beyond that threshold.
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=128 \
data.val_batch_size=128 \
data.max_prompt_length=6500 \
data.max_response_length=8192 \
actor_rollout_ref.model.path=/root/Qwen2.5-7B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.disable_log_stats=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','mlflow'] \
trainer.project_name='rl_dev' \
trainer.experiment_name='qwen_7b_rl' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=2 \
trainer.save_freq=100 \
trainer.test_freq=20 \
trainer.total_epochs=15 $@
Error traceback:
File "/root/venv/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 1708, in execute_model
output: SamplerOutput = self.model.sample(
^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 433, in sample
next_tokens = self.sampler(logits, sampling_metadata)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 274, in forward
maybe_deferred_sample_results, maybe_sampled_tokens_tensor = _sample(
^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 878, in _sample
return _sample_with_torch(
^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 847, in _sample_with_torch
return get_pythonized_sample_results(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 713, in get_pythonized_sample_results
sample_results = _random_sample(seq_groups,
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 512, in _random_sample
random_samples = random_samples.cpu()
^^^^^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
File "verl/workers/fsdp_workers.py", line 468, in generate_sequences
output = self.rollout.generate_sequences(prompts=prompts)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "verl/workers/rollout/vllm_rollout/vllm_rollout.py", line 181, in generate_sequences
output = self.inference_engine.generate(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/vllm/utils.py", line 1063, in inner
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 353, in generate
outputs = self._run_engine(use_tqdm=use_tqdm)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/workspace/env_run/verl-250208/verl/third_party/vllm/vllm_v_0_6_3/llm.py", line 161, in _run_engine
outputs = super()._run_engine(use_tqdm=use_tqdm)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 879, in _run_engine
step_outputs = self.llm_engine.step()
^^^^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/vllm/engine/llm_engine.py", line 1386, in step
outputs = self.model_executor.execute_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/workspace/env_run/verl-250208/verl/third_party/vllm/vllm_v_0_6_3/spmd_gpu_executor.py", line 163, in execute_model
all_outputs = self.worker.execute_model(execute_model_req=execute_model_req)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/workspace/env_run/verl-250208/verl/third_party/vllm/vllm_v_0_6_3/worker.py", line 267, in execute_model
return self.model_runner.execute_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/root/venv/lib/python3.11/site-packages/vllm/worker/model_runner_base.py", line 146, in _wrapper
raise type(err)(f"Error in model execution: "
RuntimeError: Error in model execution: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
File "verl/single_controller/ray/base.py", line 399, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "verl/single_controller/base/decorator.py", line 404, in inner
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "verl/workers/fsdp_workers.py", line 464, in generate_sequences
with self.rollout_sharding_manager:
File "verl/workers/sharding_manager/fsdp_vllm.py", line 105, in __exit__
torch.cuda.empty_cache()
File "/root/venv/lib/python3.11/site-packages/torch/cuda/memory.py", line 170, in empty_cache
torch._C._cuda_emptyCache()
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
Environment:
Python == 3.11.3
Package Version
bitsandbytes 0.45.2 databricks-sdk 0.43.0 datasets 2.21.0 deepspeed 0.15.0 flash-attn 2.6.1 gguf 0.10.0 latex2sympy2 1.9.1 liger_kernel 0.5.2 lightning-utilities 0.12.0 llvmlite 0.44.0 ninja 1.11.1.1 nltk 3.9.1 numpy 1.26.4 nvidia-cublas-cu12 12.1.3.1 nvidia-cuda-cupti-cu12 12.1.105 nvidia-cuda-nvrtc-cu12 12.1.105 nvidia-cuda-runtime-cu12 12.1.105 nvidia-cudnn-cu12 9.1.0.70 nvidia-cufft-cu12 11.0.2.54 nvidia-curand-cu12 10.3.2.106 nvidia-cusolver-cu12 11.4.5.107 nvidia-cusparse-cu12 12.1.0.106 nvidia-ml-py 12.560.30 nvidia-nccl-cu12 2.20.5 nvidia-nvjitlink-cu12 12.4.127 nvidia-nvtx-cu12 12.1.105 peft 0.12.0 safetensors 0.4.5 scikit-learn 1.6.1 scipy 1.14.1 tensordict 0.5.0 torch 2.4.0 torchaudio 2.5.1 torchmetrics 1.6.1 torchvision 0.19.0 transformers 4.46.1 transformers-stream-generator 0.0.5 triton 3.0.0 vllm 0.6.3 wandb 0.19.6 xformers 0.0.27.post2