Why does the collect_policy of my DDQN agent seem unrelated to the greedy policy when I reload from a checkpoint?
I am trying to train a DDQN agent (a self-driving car in GTA V) with some state observations and discrete actions. I have done an initial training run of 1 million steps and the agent now gathers an average return of around 3,500 over 5 episodes. I now want to continue training it with a lowered epsilon and learning rate. I reload the model from the checkpoint I saved, and I can see that my q_net weights are updated: when I check agent.collect_policy.trainable_variables[0] before and after restoring via "train_checkpointer.initialize_or_restore()", the values change.
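For reference, the check I do around the restore is roughly this (a simplified sketch of what I run, not the exact script):

import numpy as np

# Snapshot one of the collect policy's variables before restoring.
before = agent.collect_policy.trainable_variables[0].numpy().copy()

# Restore agent, policy, replay buffer and global step from the checkpoint.
train_checkpointer.initialize_or_restore()
after = agent.collect_policy.trainable_variables[0].numpy()

# The values do change here, so the checkpoint is clearly being loaded.
print("weights changed after restore:", not np.allclose(before, after))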
However, once I continue training, the initial assessment of the model (greedy policy over 5 episodes) shows the correct behaviour, i.e. the one I saved last. But when the training loop starts, I see that collect_policy takes actions that are not even close to the greedy policy. I expected it to follow the greedy policy and only take random actions along the way with a 10% chance, since epsilon is 0.1.
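To sanity-check the epsilon-greedy behaviour, I also compare the two policies on the same time steps; with epsilon = 0.1 I would expect collect_policy to pick the greedy action roughly 90% of the time. This is only a rough diagnostic sketch run against eval_env, not part of the training loop:

# Rough diagnostic: how often does collect_policy agree with the greedy policy?
time_step = eval_env.reset()
matches, total = 0, 0
while not time_step.is_last():
  greedy_action = agent.policy.action(time_step).action
  collect_action = agent.collect_policy.action(time_step).action
  matches += int(tf.reduce_all(tf.equal(greedy_action, collect_action)).numpy())
  total += 1
  # Advance the environment with the greedy action.
  time_step = eval_env.step(greedy_action)
print("collect/greedy agreement: {:.0%}".format(matches / total))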
Am I reloading this agent incorrectly?
import gym
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.policies import policy_saver
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
import os
import matplotlib
import matplotlib.pyplot as plt
os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.4"
num_iterations = 2000000 # @param {type:"integer"}
initial_collect_steps = 2000 # @param {type:"integer"}
collect_steps_per_iteration = 1 # @param {type:"integer"}
replay_buffer_max_length = 1000000 # @param {type:"integer"}
batch_size = 64 # @param {type:"integer"}
learning_rate = 1e-4 # @param {type:"number"}
log_interval = 100 # @param {type:"integer"}
num_eval_episodes = 5 # @param {type:"integer"}
eval_interval = 20000 # @param {type:"integer"}
n_step_update = 2
epsilon = 0.1
gamma = 0.95
target_tau = 0.9
target_period = 1000
def compute_avg_return(environment, policy, num_episodes=10):
  total_return = 0.0
  for _ in range(num_episodes):
    time_step = environment.reset()
    episode_return = 0.0
    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return
  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))
def collect_step(environment, policy):
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  next_time_step = environment.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)
  # Add the trajectory to the replay buffer.
  replay_buffer.add_batch(traj)
## CREATE ENVIRONMENT ##
env_name = 'GtaEnv-v0'
env = suite_gym.load(env_name)
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
## CREATE ENVIRONMENT ##
## CREATE MODEL ##
fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
## CREATE MODEL ##
## CREATE AGENT ##
train_step_counter = tf.Variable(0)
agent = dqn_agent.DdqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    n_step_update=n_step_update,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
    epsilon_greedy=epsilon,
    gamma=gamma,
    target_update_tau=target_tau,
    target_update_period=target_period)
agent.initialize()
## CREATE AGENT ##
# print(q_net.trainable_weights[0])
## START TRAINING STUFF ##
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)
## restore checkpointer ##
checkpoint_dir = ('D:\\ReinforcementLearning\\models\\TF_v15_ddqn')
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=1,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=train_step_counter)
train_checkpointer.initialize_or_restore()
# print(q_net.trainable_weights[0])
eval_policy = agent.policy
collect_policy = agent.collect_policy
step = agent.train_step_counter.numpy()
print("continue from step: ")
print(step)
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size,
    num_steps=n_step_update + 1).prefetch(3)
iterator = iter(dataset)
print("Initial evaluation of greedy policy: .")
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
train_step_counter = tf.Variable(step)
print("Continue training.")
for _ in range(num_iterations):
  # Collect a few steps using collect_policy and save to the replay buffer.
  for _ in range(collect_steps_per_iteration):
    collect_step(train_env, agent.collect_policy)
  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience)
  step = agent.train_step_counter.numpy()
  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss.loss))
  if step % eval_interval == 0:
    print("Evaluating greedy policy over 5 episodes.")
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1:.2f}'.format(step, avg_return))
    returns.append(avg_return)
    print(returns)
iterations = range(0, num_iterations + 1, eval_interval)
plt.plot(iterations, returns)
plt.ylabel('Average Return')
plt.xlabel('Iterations')
plt.ylim(top=10000)
plt.show()
In the first training run I saved the model like this:
checkpoint_dir = ('D:\\ReinforcementLearning\\models\\TF_v15_ddqn')
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=1,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=train_step_counter)
....
for _ in range(num_iterations):
  # Collect a few steps using collect_policy and save to the replay buffer.
  for _ in range(collect_steps_per_iteration):
    collect_step(train_env, agent.collect_policy)
  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience)
  step = agent.train_step_counter.numpy()
  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss.loss))
  if step % eval_interval == 0:
    print("Evaluating greedy policy over 5 episodes.")
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1:.2f}'.format(step, avg_return))
    if avg_return > max(returns):
      # Save the best policy if its return is greater than any return in the list.
      tf_best_policy_saver.save(best_policy_dir)
    returns.append(avg_return)
    print(returns)
    train_checkpointer.save(step)
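tf_best_policy_saver and best_policy_dir are created in the part of the script I left out; roughly, they are set up like this (a sketch, the directory name here is just an example):

# Saves the greedy policy as a SavedModel whenever a new best return is reached.
tf_best_policy_saver = policy_saver.PolicySaver(agent.policy)
best_policy_dir = 'D:\\ReinforcementLearning\\models\\TF_v15_ddqn_best_policy'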