FP16 training has an anomaly on the NPU.
I trained on the NPU using FP16 and found many NaN values in the step-1 training metrics:
(TaskRunner pid=1218449) [2025-11-19 17:41:06,020] [INFO] [aggregate_logger.py:54:log]: step:1
actor/entropy:0.8346855640411377
training/rollout_probs_diff_valid:1
training/rollout_probs_diff_max:nan
training/rollout_probs_diff_mean:nan
training/rollout_probs_diff_std:nan
training/rollout_actor_probs_pearson_corr:nan
rollout_corr/rollout_is_veto_fraction:0.0
rollout_corr/rollout_is_catastrophic_token_fraction:0.0
rollout_corr/training_ppl:2.338888647655646
rollout_corr/training_log_ppl:0.8020172032217184
rollout_corr/kl:nan
rollout_corr/k3_kl:nan
rollout_corr/rollout_ppl:nan
rollout_corr/rollout_log_ppl:nan
rollout_corr/log_ppl_diff:nan
rollout_corr/log_ppl_abs_diff:nan
rollout_corr/log_ppl_diff_max:nan
rollout_corr/log_ppl_diff_min:nan
rollout_corr/ppl_ratio:nan
rollout_corr/chi2_token:nan
rollout_corr/chi2_seq:nan
actor/pg_clipfrac:0.0
actor/ppo_kl:-1.7628715015168456e-05
actor/pg_clipfrac_lower:0.0
actor/pg_loss:0.015553878911741018
actor/grad_norm:0.08278856426477432
perf/mfu/actor:0.049761180142850164
perf/max_memory_allocated_gb:54.83790826797485
perf/max_memory_reserved_gb:109.80078125
perf/cpu_memory_used_gb:170.70977020263672
actor/lr:1e-06
training/global_step:1
training/epoch:0
critic/score/mean:0.498046875
critic/score/max:1.0
critic/score/min:0.0
critic/rewards/mean:0.498046875
critic/rewards/max:1.0
critic/rewards/min:0.0
critic/advantages/mean:-0.09820344299077988
critic/advantages/max:2.4748666286468506
critic/advantages/min:-2.4748666286468506
critic/returns/mean:-0.09820344299077988
critic/returns/max:2.4748666286468506
critic/returns/min:-2.4748666286468506
response_length/mean:6326.798828125
response_length/max:8192.0
response_length/min:542.0
response_length/clip_ratio:0.373046875
response_length_non_aborted/mean:6326.798828125
response_length_non_aborted/max:8192.0
response_length_non_aborted/min:542.0
response_length_non_aborted/clip_ratio:0.373046875
response/aborted_ratio:0.0
prompt_length/mean:139.78125
prompt_length/max:721.0
prompt_length/min:34.0
prompt_length/clip_ratio:0.0
num_turns/min:2
num_turns/max:2
num_turns/mean:2.0
timing_s/start_profile:0.0004989374428987503
timing_s/agent_loop/generate_sequences/min:11.98785026371479
timing_s/agent_loop/generate_sequences/max:212.2606815006584
timing_s/agent_loop/generate_sequences/mean:147.83574966904052
timing_s/agent_loop/tool_calls/min:0.0
timing_s/agent_loop/tool_calls/max:0.0
timing_s/agent_loop/tool_calls/mean:0.0
timing_s/agent_loop/slowest/generate_sequences:212.2606815006584
timing_s/agent_loop/slowest/tool_calls:0.0
timing_s/agent_loop/slowest/prompt_length:45
timing_s/agent_loop/slowest/response_length:8192
timing_s/gen:250.10541202127934
timing_s/reward:0.0005288887768983841
timing_s/old_log_prob:308.62451554462314
timing_s/adv:0.8268838245421648
timing_s/update_actor:336.87742983177304
timing_s/dump_rollout_generations:1.661316566169262
timing_s/step:898.3911366835237
timing_s/stop_profile:0.00014478527009487152
timing_per_token_ms/update_actor:0.10174833098656375
timing_per_token_ms/adv:0.00024974676727071334
timing_per_token_ms/gen:0.07720920897350998
perf/total_num_tokens:3310889
perf/time_per_step:898.3911366835237
perf/throughput:460.66919863857794
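For context, the NaN metrics above (training/rollout_probs_diff_*, rollout_corr/kl, rollout_corr/rollout_ppl, ...) are, as I understand it, all derived from the per-token log-probs returned by the rollout engine together with the log-probs recomputed by the actor. A minimal sketch with made-up values (not verl's exact metric code) of how a single NaN rollout log-prob poisons these aggregates:

```python
import torch

# Hypothetical per-token log-probs for one response: the rollout engine
# returned NaN for one token (e.g. from overflowed FP16 logits), while the
# actor's recomputed log-probs are finite.
actor_log_probs   = torch.tensor([-0.21, -1.30, -0.05, -2.40])
rollout_log_probs = torch.tensor([-0.20, -1.28, float("nan"), -2.35])

probs_diff = (actor_log_probs.exp() - rollout_log_probs.exp()).abs()
print(probs_diff.max(), probs_diff.mean(), probs_diff.std())  # nan -> rollout_probs_diff_{max,mean,std}

kl = actor_log_probs - rollout_log_probs
print(kl.mean())                                              # nan -> rollout_corr/kl

rollout_ppl = (-rollout_log_probs).mean().exp()
print(rollout_ppl)                                            # nan -> rollout_corr/rollout_ppl
```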
I also found outputs consisting of '!!!!' in trainer.rollout_data_dir:
{"input": "<|User|>In coordinate space, $A = (1,2,3),$ $B = (5,3,1),$ and $C = (3,4,5).$ Find the orthocenter of triangle $ABC.$ Please reason step by step, and put your final answer within \\boxed{}.\n<|Assistant|><think>\n", "output": "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", "gts": "\\left( \\frac{5}{2}, 3, \\frac{7}{2} \\right)", "score": 0.0, "step": 1, "formatted": false}
When the output tokens are "!", the log-probs become NaN, so many of the logged statistics are NaN as well. If TIS is applied, the loss and grad_norm also become NaN.
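To illustrate what I suspect is happening (a minimal sketch with made-up values, not verl's actual loss code): a logit that exceeds the FP16 range (~65504) turns into inf, log_softmax over that row yields NaN, and the NaN then flows through the importance-sampling ratio into the policy-gradient loss:

```python
import torch

# 7e4 exceeds the FP16 max (~65504), so it becomes inf after the cast.
logits = torch.tensor([7.0e4, 1.0, -2.0], dtype=torch.float16)
rollout_logprob = torch.log_softmax(logits.float(), dim=-1)[0]  # nan (inf - inf)
print(rollout_logprob)

# Hypothetical TIS-style ratio: NaN rollout log-prob -> NaN ratio -> NaN loss.
actor_logprob = torch.tensor(-0.3)
advantage = torch.tensor(1.5)
ratio = torch.exp(actor_logprob - rollout_logprob)
pg_loss = -(torch.clamp(ratio, max=2.0) * advantage)
print(ratio, pg_loss)  # both nan, so grad_norm ends up NaN as well
```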
What could be the reason? This is my startup script:
export RAY_DEDUP_LOGS=0
export VERL_PPO_LOGGING_LEVEL=DEBUG
export VERL_LOGGING_LEVEL=DEBUG
export HYDRA_FULL_ERROR=1
export TRAIN_OUTPUT_DIR=/cache/test_verl_rl_sanity
export ASCEND_PROCESS_LOG_PATH=$TRAIN_OUTPUT_DIR/ascend-log
export TENSORBOARD_DIR=$TRAIN_OUTPUT_DIR/tensorboard_log
export VLLM_ASCEND_ENABLE_NZ=0
export VLLM_USE_V1=1
export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
export HCCL_CONNECT_TIMEOUT=7200
export HCCL_EXEC_TIMEOUT=7200
export HCCL_IF_BASE_PORT=64000
export HCCL_CONNECT_TIMEOUT=1800
set -x
if [ -z "$MODEL_PATH" ]; then
MODEL_PATH="DeepSeek/DeepSeek_R1_Distill_Qwen_1_5B"
fi
if [ -z "$ALGO" ]; then
# ALGO=PPO-Token-TIS
ALGO=PPO
fi
if [ -z "$DTYPE" ]; then
DTYPE=float16
fi
if [ -z "$LOSS_AGG_MODE" ]; then
# "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
LOSS_AGG_MODE=token-mean
fi
echo $MODEL_PATH
echo $ALGO
echo $DTYPE
echo "${@:1}"
# actor_rollout_ref.rollout.enable_chunked_prefill=False \
# actor_rollout_ref.actor.policy_loss.algo=$ALGO \
# actor_rollout_ref.actor.dtype=$DTYPE \
# actor_rollout_ref.rollout.max_num_seqs=32 \
# Train on a single node with 8 NPUs.
RAY_DEDUP_LOGS=0 PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_batch_size=64 \
data.val_batch_size=512 \
data.max_prompt_length=1024 \
data.max_response_length=8192 \
data.filter_overlong_prompts=true \
data.seed=42 \
custom_reward_function.path=verl_sail_fp16/verl/trainer/ppo/math_grader.py \
custom_reward_function.name=compute_math_score \
actor_rollout_ref.model.use_liger=false \
actor_rollout_ref.model.use_fused_kernels=false \
actor_rollout_ref.model.fused_kernel_options.impl_backend=torch \
actor_rollout_ref.model.enable_gradient_checkpointing=true \
actor_rollout_ref.actor.clip_ratio_high=0.28 \
actor_rollout_ref.actor.use_kl_loss=false \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.kl_loss_coef=0 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.calculate_log_probs=true \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
actor_rollout_ref.rollout.temperature=1 \
actor_rollout_ref.rollout.val_kwargs.temperature=0.6 \
actor_rollout_ref.rollout.val_kwargs.n=16 \
actor_rollout_ref.rollout.val_kwargs.do_sample=true \
trainer.nnodes=1 \
trainer.save_freq=200 \
trainer.test_freq=50 \
trainer.log_val_generations=20 \
trainer.max_actor_ckpt_to_keep=8 \
data.train_files=verl_sail_fp16/sanity_test/math_1460.parquet \
data.val_files=[verl_sail_fp16/sanity_test/aime_2024.parquet,verl_sail_fp16/sanity_test/aime_2025.parquet] \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.actor.ppo_mini_batch_size=16 \
actor_rollout_ref.actor.loss_agg_mode=$LOSS_AGG_MODE \
actor_rollout_ref.actor.fsdp_config.dtype=$DTYPE \
actor_rollout_ref.rollout.dtype=$DTYPE \
actor_rollout_ref.rollout.n=8 \
actor_rollout_ref.rollout.val_kwargs.n=32 \
actor_rollout_ref.actor.strategy=fsdp \
actor_rollout_ref.ref.strategy=fsdp \
actor_rollout_ref.actor.use_torch_compile=False \
actor_rollout_ref.ref.fsdp_config.param_offload=False \
actor_rollout_ref.model.enable_activation_offload=False \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.actor.use_dynamic_bsz=true \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=20480 \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=20480 \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=True \
actor_rollout_ref.rollout.enable_chunked_prefill=True \
actor_rollout_ref.rollout.max_num_batched_tokens=9216 \
trainer.project_name=precision-rl \
trainer.experiment_name=sanity_test-$DTYPE-$ALGO \
trainer.logger=['console','tensorboard'] \
trainer.val_before_train=False \
trainer.total_epochs=20 \
trainer.device=npu \
trainer.resume_mode="disable" \
trainer.rollout_data_dir=$TRAIN_OUTPUT_DIR/train_rollout \
trainer.validation_data_dir=$TRAIN_OUTPUT_DIR/eval_rollout \
trainer.default_local_dir=$TRAIN_OUTPUT_DIR trainer.n_gpus_per_node=8 2>&1 | tee /cache/verl_sail.log
My environment:
- cann: 8.3.rc1
- vllm_ascend: 0.11.0.rc1
- verl: main branch (as of Nov 18)
- torch & torch_npu: 2.7.1