OLMo + RL
I put the code here. To reproduce my work, pip install ai2_olmo and run:
for beta in 0.05
do
for lr in 3e-7
do
python mason.py \
--cluster ai2/augusta-google-1 --image nathanl/open_instruct_auto --pure_docker_mode \
--workspace ai2/tulu-3-dev \
--priority high \
--preemptible \
--num_nodes 1 \
--image costah/open_instruct_ppo_ray_olmo \
--budget ai2/allennlp \
--gpus 8 -- pip install --upgrade transformers \&\& python open_instruct/ppo_vllm_thread_ray_gtrl_olmo.py \
--exp_name "ppo_olmo_rm_init_one_epoch_beta_${beta}_lr_${lr}" \
--beta $beta \
--learning_rate $lr \
--dataset_mixer "{\"ai2-adapt-dev/gsm8k_ground_truth\": 1.0}" \
--dataset_train_splits train \
--dataset_eval_mixer "{\"ai2-adapt-dev/gsm8k_math_ground_truth\": 1.0}" \
--dataset_eval_splits test \
--max_token_length 2048 \
--max_prompt_token_length 2048 \
--response_length 1024 \
--model_name_or_path allenai/open_instruct_dev \
--model_revision olmo_7b_soup_anneal_v3.9_4_DPO___model__42__1730863426 \
--reward_model_path allenai/open_instruct_dev \
--reward_model_revision reward_modeling__1__1730930663 \
--non_stop_penalty \
--stop_token eos \
--temperature 1.0 \
--ground_truths_key ground_truth \
--chat_template tulu \
--sft_messages_key messages \
--total_episodes 200000 \
--penalty_reward_value -10.0 \
--deepspeed_stage 3 \
--per_device_train_batch_size 4 \
--local_rollout_forward_batch_size 8 \
--local_mini_batch_size 32 \
--local_rollout_batch_size 32 \
--actor_num_gpus_per_node 7 \
--vllm_tensor_parallel_size 1 \
--num_epochs 1 \
--apply_verifiable_reward true \
--output_dir /output \
--seed 3 \
--num_evals 3 \
--reward_model_multiplier 0.0 \
--no_try_launch_beaker_eval_jobs \
--gradient_checkpointing \
--with_tracking
done
done
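For context on the reward setup: with --reward_model_multiplier 0.0 the reward model contributes nothing, so training is driven entirely by the verifiable ground-truth reward (--apply_verifiable_reward true, reading the ground_truth field from the GSM8K dataset), with --penalty_reward_value -10.0 applied via --non_stop_penalty to generations that never stop. The sketch below illustrates that scoring logic only; the function and constant names are hypothetical and are not the actual identifiers in open_instruct/ppo_vllm_thread_ray_gtrl_olmo.py.

# Illustrative sketch of verifiable-reward scoring for GSM8K-style data.
# Names (extract_final_answer, VERIFIABLE_REWARD, score) are hypothetical;
# only the flag values are taken from the command above.
import re

VERIFIABLE_REWARD = 10.0       # assumed reward when the answer matches the ground truth
PENALTY_REWARD_VALUE = -10.0   # --penalty_reward_value, used with --non_stop_penalty

def extract_final_answer(text: str) -> str | None:
    """Pull the last number out of a model response (GSM8K answers are numeric)."""
    numbers = re.findall(r"-?\d+\.?\d*", text.replace(",", ""))
    return numbers[-1] if numbers else None

def score(response: str, ground_truth: str, stopped_properly: bool) -> float:
    # Responses that never emit the stop token get the penalty value.
    if not stopped_properly:
        return PENALTY_REWARD_VALUE
    # Otherwise the reward is verifiable: match against the dataset's ground_truth.
    answer = extract_final_answer(response)
    return VERIFIABLE_REWARD if answer == ground_truth else 0.0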