
[PPO] PPOAgent works incorrectly

Open MaxTitkov opened this issue 2 years ago • 1 comment

I'm trying to implement a PPO agent to play LunarLander-v2 with the tf_agents library, following this tutorial (github repo).

networks.py

from tf_agents.networks import actor_distribution_network, value_network
from tf_agents.networks import actor_distribution_rnn_network, value_rnn_network
import tensorflow as tf

def create_networks(tf_env, lstm=True, custom_nn=True):
    # Note: custom_nn is currently unused.
    actor_fc_layers = (512, 256, 64)
    value_fc_layers = (512, 256, 64)
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)
    if lstm:
        print('Enable LSTM')
        lstm_size = (20,)
        actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
            tf_env.observation_spec(), tf_env.action_spec(),
            input_fc_layer_params=actor_fc_layers, lstm_size=lstm_size,
            output_fc_layer_params=None)
        value_net = value_rnn_network.ValueRnnNetwork(
            tf_env.observation_spec(), input_fc_layer_params=value_fc_layers,
            lstm_size=lstm_size, output_fc_layer_params=None)
    else:
        print('Disable LSTM')
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            tf_env.observation_spec(), tf_env.action_spec(),
            fc_layer_params=actor_fc_layers,
            activation_fn=tf.keras.activations.tanh)
        value_net = value_network.ValueNetwork(
            tf_env.observation_spec(), fc_layer_params=value_fc_layers,
            kernel_initializer=last_init,
            activation_fn=tf.keras.activations.tanh)

    return actor_net, value_net
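
For reference, a quick spec check (a sketch, not part of the original files, assuming the same suite_gym environment used in main.py) shows what the networks are built against: LunarLander-v2 has an 8-dimensional float observation and a single discrete action with 4 values, so ActorDistributionNetwork ends in a categorical head.

# Sketch: print the specs the networks above consume.
from tf_agents.environments import suite_gym, tf_py_environment

env = tf_py_environment.TFPyEnvironment(suite_gym.load('LunarLander-v2'))
print(env.observation_spec())  # expected: shape (8,), float32
print(env.action_spec())       # expected: scalar bounded integer spec with values 0..3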

main.py

from tf_agents.environments import suite_gym, tf_py_environment
import tensorflow as tf
from tf_agents.agents.ppo import ppo_agent, ppo_clip_agent
from networks import create_networks
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.replay_buffers import tf_uniform_replay_buffer

import matplotlib.pyplot as plt
import numpy as np

env_name='LunarLander-v2'
# env_name = 'CartPole-v0'
# env_name = 'Pendulum-v0'

train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

learning_rate = 1e-3

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
actor_net, value_net = create_networks(train_env, lstm=False, custom_nn=False)

train_step_counter = tf.Variable(0)

tf_agent = ppo_clip_agent.PPOClipAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    optimizer,
    actor_net=actor_net,
    value_net=value_net,
    gradient_clipping=0.5,
    entropy_regularization=0.0,
    importance_ratio_clipping=0.2,
    normalize_observations=False,
    normalize_rewards=False,
    use_gae=True,
    num_epochs=1,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=train_step_counter)

tf_agent.initialize()

replay_buffer_capacity=1001
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=train_env.batch_size, #num_parallel_environments
        max_length=replay_buffer_capacity)


# To store reward history of each episode
ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []
total_episodes = 5000

collect_policy = tf_agent.collect_policy
eval_policy = tf_agent.policy

collect_episodes_per_iteration=5
collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        train_env,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_episodes=collect_episodes_per_iteration)

for episode in range(total_episodes):
    # Collect a few episodes with the collect policy, then train on them.
    collect_driver.run()
    trajectories = replay_buffer.gather_all()
    train_loss, extra = tf_agent.train(experience=trajectories)
    print(f'Episode = {episode}; Train loss = {train_loss}')
    step = tf_agent.train_step_counter.numpy()
    replay_buffer.clear()

    # Evaluate the greedy policy for one episode.
    prev_time_step = eval_env.reset()
    # prev_time_step = train_env.reset()  # For collect policy visualization
    episodic_reward = 0

    while True:
        eval_env.render()
        action_step = eval_policy.action(prev_time_step, eval_policy.get_initial_state(batch_size=1))
        # action_step = collect_policy.action(prev_time_step, collect_policy.get_initial_state(batch_size=1))  # Run collect policy
        time_step = eval_env.step(action_step.action)
        # time_step = train_env.step(action_step.action)  # Run collect policy

        episodic_reward += time_step.reward[0]
        # End this episode on the last step.
        if time_step.is_last():
            break

        prev_time_step = time_step

    ep_reward_list.append(episodic_reward)
    # Mean of the last 40 episodes.
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Avg Reward is ==> {}".format(episode, avg_reward))
    avg_reward_list.append(avg_reward)

# Plotting graph
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Episodic Reward")
plt.show()

I tried changing the network layer parameters, the learning rate, a prioritized replay buffer, other environments, PPOAgent vs. PPOClipAgent, the agent hyperparameters, other replay buffer capacities, and other drivers and their parameters, but unlike the agent in the tutorial (github repo), my PPO agent doesn't train correctly for any number of steps:

Episode = 3000; Train loss = 2271.637939453125
Episode * 3000 * Avg Reward is ==> -398.26611328125
Episode = 3001; Train loss = 9885.1669921875
Episode * 3001 * Avg Reward is ==> -428.7572326660156
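
A per-term breakdown of the loss might help narrow this down (a sketch, assuming LossInfo.extra returned by PPOClipAgent.train() is tf_agents' PPOLossInfo namedtuple; the field names may differ between versions):

# Sketch (version-dependent): break the total PPO loss into its components.
loss_info = tf_agent.train(experience=trajectories)
print('total:', loss_info.loss.numpy(),
      '| policy_gradient:', loss_info.extra.policy_gradient_loss.numpy(),
      '| value_estimation:', loss_info.extra.value_estimation_loss.numpy(),
      '| entropy_reg:', loss_info.extra.entropy_regularization_loss.numpy())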

The Jupyter notebook is here: https://colab.research.google.com/drive/1hOPP4uG7izcLrO9prbUEilo5U_Kz1DWr?usp=sharing

Where did I go wrong?

MaxTitkov avatar Oct 29 '21 09:10 MaxTitkov

I'm sorry Max, but aligning different PPO implementations is hard, so try to review all the parameters and defaults across them.

sguada avatar Nov 01 '21 17:11 sguada
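
As a starting point for that review, a minimal sketch (using only the standard-library inspect module, not any special tf_agents API) that dumps the PPOClipAgent constructor arguments and their defaults so they can be compared against the tutorial's settings:

# Sketch: list every PPOClipAgent constructor argument with its default value.
import inspect
from tf_agents.agents.ppo import ppo_clip_agent

sig = inspect.signature(ppo_clip_agent.PPOClipAgent.__init__)
for name, param in sig.parameters.items():
    if name != 'self':
        print(f'{name} = {param.default!r}')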