agents
agents copied to clipboard
[PPO] PPOAgent works incorrectly
I'm trying to implement a PPO agent that plays LunarLander-v2 using the tf_agents library, as in this tutorial (github repo).
networks.py
from tf_agents.networks import actor_distribution_network, value_network
from tf_agents.networks import actor_distribution_rnn_network, value_rnn_network
import tensorflow as tf
def create_networks(tf_env, lstm=True, custom_nn=True):
    """Build the actor and value networks used by the PPO agent.

    Args:
        tf_env: Environment whose ``observation_spec()`` and ``action_spec()``
            define the network inputs and outputs.
        lstm: When true, build the recurrent (RNN) network variants;
            otherwise build plain feed-forward networks with tanh activations.
        custom_nn: Accepted for interface compatibility; not read here.

    Returns:
        A ``(actor_net, value_net)`` tuple.
    """
    # Actor and critic share the same hidden-layer layout.
    hidden_units = (512, 256, 64)

    print('Enable LSTM' if lstm else 'Disable LSTM')
    obs_spec = tf_env.observation_spec()
    act_spec = tf_env.action_spec()

    if lstm:
        recurrent_units = (20,)
        rnn_actor = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
            obs_spec,
            act_spec,
            input_fc_layer_params=hidden_units,
            lstm_size=recurrent_units,
            output_fc_layer_params=None,
        )
        rnn_critic = value_rnn_network.ValueRnnNetwork(
            obs_spec,
            input_fc_layer_params=hidden_units,
            lstm_size=recurrent_units,
            output_fc_layer_params=None,
        )
        return rnn_actor, rnn_critic

    # NOTE(review): this very small uniform initializer is handed to the value
    # network as its kernel initializer -- confirm that is intended.
    small_uniform = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)
    tanh = tf.keras.activations.tanh
    dense_actor = actor_distribution_network.ActorDistributionNetwork(
        obs_spec,
        act_spec,
        fc_layer_params=hidden_units,
        activation_fn=tanh,
    )
    dense_critic = value_network.ValueNetwork(
        obs_spec,
        fc_layer_params=hidden_units,
        kernel_initializer=small_uniform,
        activation_fn=tanh,
    )
    return dense_actor, dense_critic
main.py
from tensorflow._api.v2.compat.v1 import train
from tf_agents.environments import suite_gym, tf_py_environment
import tensorflow as tf
from tf_agents.agents.ppo import ppo_agent, ppo_clip_agent
from networks import create_networks
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
import matplotlib.pyplot as plt
import numpy as np
# --- Experiment setup: environments, agent, replay buffer, collect driver ---

env_name = 'LunarLander-v2'
# Other environments tried: 'CartPole-v0', 'Pendulum-v0'

# Separate environment instances so evaluation never disturbs collection.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

learning_rate = 1e-3
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

actor_net, value_net = create_networks(train_env, lstm=False, custom_nn=False)

train_step_counter = tf.Variable(0)
# NOTE(review): num_epochs=1 makes a single optimisation pass over each
# collected batch; the library default is believed to be larger (25) --
# TODO confirm against the reference implementation being reproduced.
tf_agent = ppo_clip_agent.PPOClipAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    optimizer,
    actor_net=actor_net,
    value_net=value_net,
    gradient_clipping=0.5,
    entropy_regularization=0.0,
    importance_ratio_clipping=0.2,
    normalize_observations=False,
    normalize_rewards=False,
    use_gae=True,
    num_epochs=1,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=train_step_counter,
)
tf_agent.initialize()

collect_policy = tf_agent.collect_policy
eval_policy = tf_agent.policy

total_episodes = 5000
collect_episodes_per_iteration = 5

# The replay buffer is circular: once a batch row holds `max_length` items,
# further writes overwrite the oldest ones. All episodes of one iteration go
# into the single row of the single training environment, so the row must fit
# `collect_episodes_per_iteration` full-length episodes; otherwise the start
# of the batch is silently dropped before training.
# Assumes episodes are capped at 1000 steps (LunarLander-v2's registered time
# limit) -- adjust when switching to an environment with longer episodes.
max_episode_steps = 1000
replay_buffer_capacity = collect_episodes_per_iteration * (max_episode_steps + 1)
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    tf_agent.collect_data_spec,
    batch_size=train_env.batch_size,  # number of parallel environments
    max_length=replay_buffer_capacity,
)

# Reward of each evaluation episode.
ep_reward_list = []
# Running average of the most recent evaluation rewards.
avg_reward_list = []

# Runs whole episodes with the collect policy and appends every trajectory
# to the replay buffer.
collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
    train_env,
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_episodes=collect_episodes_per_iteration,
)
# --- Training loop: collect -> train -> evaluate one episode -> log ---
for episode in range(total_episodes):
    # PPO is on-policy: gather fresh episodes, train on exactly that data,
    # then drop it so the next iteration only sees the updated policy.
    collect_driver.run()
    trajectories = replay_buffer.gather_all()
    train_loss, _ = tf_agent.train(experience=trajectories)
    print(f'Episode = {episode}; Train loss = {train_loss}')
    replay_buffer.clear()

    # Evaluate the current policy on one rendered episode.
    # (To visualise the collect policy instead, use `collect_policy` and
    # `train_env` in place of `eval_policy` and `eval_env` below.)
    time_step = eval_env.reset()
    # Carry the policy state across steps so that recurrent policies keep
    # their memory within the episode; for stateless policies this is empty.
    policy_state = eval_policy.get_initial_state(batch_size=1)
    episodic_reward = 0
    while not time_step.is_last():
        eval_env.render()
        action_step = eval_policy.action(time_step, policy_state)
        policy_state = action_step.state
        time_step = eval_env.step(action_step.action)
        episodic_reward += time_step.reward[0]

    ep_reward_list.append(episodic_reward)
    # Mean of last 40 episodes
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Avg Reward is ==> {}".format(episode, avg_reward))
    avg_reward_list.append(avg_reward)

# Plot the running-average reward once training has finished.
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()
I have tried changing the network layer parameters, the learning rate, a prioritized replay buffer, other environments, PPO vs. PPOClip agents, the agent hyperparameters, the replay buffer capacity, and other drivers and their parameters, but unlike the agent in the tutorial (github repo), my PPO agent doesn't train correctly for any number of steps:
Episode = 3000; Train loss = 2271.637939453125
Episode * 3000 * Avg Reward is ==> -398.26611328125
Episode = 3001; Train loss = 9885.1669921875
Episode * 3001 * Avg Reward is ==> -428.7572326660156
Jupyter notebook is here: https://colab.research.google.com/drive/1hOPP4uG7izcLrO9prbUEilo5U_Kz1DWr?usp=sharing
Where did I go wrong?
I'm sorry Max, but aligning different PPO implementations is hard, so try to review all the parameters and defaults across them.