
Why does the collect_policy of a DDQN agent seem unrelated to the policy when I reload from a checkpoint?


I am trying to train a DDQN agent (a self-driving car in GTAV) with some state observations and discrete actions. After an initial training run of 1 million steps, the agent now achieves an average return of about 3,500 over 5 evaluation episodes. I now want to continue training it with lowered epsilon and learning rate parameters. When I reload the model from the checkpoint I saved, I can see that my q_net weights are updated: if I check agent.collect_policy.trainable_variables[0] before and after restoring via "train_checkpointer.initialize_or_restore()", the values change as expected.
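
For reference, this is roughly how I verify the restore (a sketch only, using the q_net, agent and train_checkpointer names from the full script below):

# Snapshot the first q_net weight tensor before restoring the checkpoint.
weights_before = q_net.trainable_weights[0].numpy().copy()

train_checkpointer.initialize_or_restore()

weights_after = q_net.trainable_weights[0].numpy()
print('q_net weights changed after restore:', not (weights_before == weights_after).all())

# The collect policy exposes the same (now restored) variables.
print(agent.collect_policy.trainable_variables[0])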

However, once I continue training, the initial assessment of the model (greedy policy over 5 episodes) shows the correct behaviour, i.e. the one I saved last. But as soon as the training loop starts, collect_policy takes actions that are not even close to the greedy policy. Since epsilon is 0.1, I expect it to follow the greedy policy and only take random actions with a 10% chance along the way.
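
To make that expectation concrete, this is the kind of agreement check I would run right after the restore (a sketch only, using agent and eval_env from the script below; with epsilon = 0.1 I would expect agreement on roughly 90% of steps, slightly more since a random action can coincide with the greedy one):

agree = 0
num_checks = 100
time_step = eval_env.reset()
for _ in range(num_checks):
  greedy_action = agent.policy.action(time_step).action
  collect_action = agent.collect_policy.action(time_step).action
  if greedy_action.numpy()[0] == collect_action.numpy()[0]:
    agree += 1
  time_step = eval_env.step(greedy_action)
  if time_step.is_last():
    time_step = eval_env.reset()
print('collect/greedy agreement over {} steps: {:.2f}'.format(num_checks, agree / num_checks))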

Am I reloading this agent incorrectly?

import gym
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.policies import policy_saver
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
import os
import matplotlib
import matplotlib.pyplot as plt

os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.4"
num_iterations = 2000000 # @param {type:"integer"}

initial_collect_steps = 2000  # @param {type:"integer"}
collect_steps_per_iteration = 1 # @param {type:"integer"}
replay_buffer_max_length = 1000000  # @param {type:"integer"}

batch_size = 64  # @param {type:"integer"}
learning_rate = 1e-4  # @param {type:"number"}
log_interval = 100  # @param {type:"integer"}

num_eval_episodes = 5  # @param {type:"integer"}
eval_interval = 20000  # @param {type:"integer"}
n_step_update = 2
epsilon = 0.1
gamma = 0.95
target_tau = 0.9
target_period = 1000

def compute_avg_return(environment, policy, num_episodes=10):

  total_return = 0.0
  for _ in range(num_episodes):

    time_step = environment.reset()
    episode_return = 0.0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]


def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))
          
def collect_step(environment, policy):
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  next_time_step = environment.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)

  # Add the trajectory to the replay buffer.
  replay_buffer.add_batch(traj)


## CREATE ENVIRONMENT ##
env_name = 'GtaEnv-v0'
env = suite_gym.load(env_name)
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
## CREATE ENVIRONMENT ##


## CREATE MODEL ##
fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]

q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))

q_net = sequential.Sequential(dense_layers + [q_values_layer])

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
## CREATE MODEL ##

## CREATE AGENT ##
train_step_counter = tf.Variable(0)

agent = dqn_agent.DdqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    n_step_update=n_step_update,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
    epsilon_greedy=epsilon,
    gamma=gamma,
    target_update_tau=target_tau,
    target_update_period=target_period)

agent.initialize()
## CREATE AGENT ##
# print(q_net.trainable_weights[0])

## START TRAINING STUFF ##


replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

## restore checkpointer ##
checkpoint_dir = ('D:\\ReinforcementLearning\\models\\TF_v15_ddqn')
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=1,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=train_step_counter
)


train_checkpointer.initialize_or_restore()
# print(q_net.trainable_weights[0])

eval_policy = agent.policy
collect_policy = agent.collect_policy

step = agent.train_step_counter.numpy()
print("continue from step: ")
print(step)

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size,
    num_steps=n_step_update + 1).prefetch(3)

iterator = iter(dataset)

print("Initial evaluation of greedy policy: .")
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

train_step_counter = tf.Variable(step)


print("Continue training.")
for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  for _ in range(collect_steps_per_iteration):
    collect_step(train_env, agent.collect_policy)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience)

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss.loss))

  if step % eval_interval == 0:
    print("Evaluation greedy policy over 5 episodes. ")
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1:.2f}'.format(step, avg_return))

    returns.append(avg_return)
    print(returns)

iterations = range(0, num_iterations + 1, eval_interval)
plt.plot(iterations, returns)
plt.ylabel('Average Return')
plt.xlabel('Iterations')
plt.ylim(top=10000)
plt.show()

In the first training run, I saved the model as follows:

checkpoint_dir = ('D:\\ReinforcementLearning\\models\\TF_v15_ddqn')
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=1,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=train_step_counter
)


....


for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  for _ in range(collect_steps_per_iteration):
    collect_step(train_env, agent.collect_policy)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience)

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss.loss))

  if step % eval_interval == 0:
    print("Evaluation greedy policy over 5 episodes. ")
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1:.2f}'.format(step, avg_return))
    if avg_return > max(returns):
        # Save the best policy when its return exceeds every return collected so far.
        tf_best_policy_saver.save(best_policy_dir)

    returns.append(avg_return)
    print(returns)
    train_checkpointer.save(step)

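(Not shown above: tf_best_policy_saver and best_policy_dir are created in the elided part. Roughly, and with the directory name purely illustrative, the setup is:)

# Assumed setup for the best-policy saver used above; the directory name is illustrative.
best_policy_dir = 'D:\\ReinforcementLearning\\models\\TF_v15_ddqn_best_policy'
tf_best_policy_saver = policy_saver.PolicySaver(agent.policy)
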
aonurgiray, Mar 11 '23 14:03