Help with Observation Masking
Hello,
I am currently trying to build a DQN in which illegal actions are masked out. To understand the whole thing, I built a small test setup. I also want to use the observation_and_action_constraint_splitter, but I haven't applied it yet because I already get an error message before that. Maybe someone can help me.
As a starting point I took the example from @ormandi here: https://github.com/tensorflow/agents/issues/397
The agent is a DqnAgent with a QRnnNetwork. The complete code is this:
#!pip install "gym>=0.21.0"
#!pip install tf-agents
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
import abc
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from tf_agents.agents.dqn import dqn_agent
from tf_agents.networks import q_rnn_network
from tf_agents.utils import common
class CardGameEnvWithMask(py_environment.PyEnvironment):

  def __init__(self):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
    self._observation_spec = {
        'observation': array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=0, name='observation'),
        'mask': array_spec.ArraySpec(
            shape=(1,), dtype=np.bool_, name='mask')}
    self._state = 0
    self._episode_ended = False

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def _obs(self, obs, mask=True):
    return {'observation': obs, 'mask': np.array([mask], dtype=np.bool_)}

  def _reset(self):
    self._state = 0
    self._episode_ended = False
    return ts.restart(self._obs(np.array([self._state], dtype=np.int32)))

  def _step(self, action):
    if self._episode_ended:
      # The last action ended the episode. Ignore the current action and start
      # a new episode.
      return self.reset()

    # Make sure episodes don't go on forever.
    if action == 1:
      self._episode_ended = True
    elif action == 0:
      new_card = np.random.randint(1, 11)
      self._state += new_card
    else:
      raise ValueError('`action` should be 0 or 1.')

    if self._episode_ended or self._state >= 21:
      reward = self._state - 21 if self._state <= 21 else -21
      return ts.termination(
          self._obs(np.array([self._state], dtype=np.int32), False), reward)
    else:
      return ts.transition(
          self._obs(np.array([self._state], dtype=np.int32), True),
          reward=0.0, discount=1.0)
environment = CardGameEnvWithMask()
utils.validate_py_environment(environment, episodes=5)
get_new_card_action = np.array(0, dtype=np.int32)
end_round_action = np.array(1, dtype=np.int32)
environment = CardGameEnvWithMask()
time_step = environment.reset()
print(time_step)
cumulative_reward = time_step.reward
for _ in range(3):
  time_step = environment.step(get_new_card_action)
  print(time_step)
  cumulative_reward += time_step.reward
time_step = environment.step(end_round_action)
print(time_step)
cumulative_reward += time_step.reward
print('Final Reward = ', cumulative_reward)
environment.action_spec()
environment.observation_spec()
environment.step(1)
learning_rate = 1e-3
#network configuration
input_fc_layer_params = (40,)
lstm_size=(20,)
output_fc_layer_params=(20,)
# as we are using a dictionary observation in our environment, we create preprocessing layers
preprocessing_layers = {
    'mask': tf.keras.layers.Flatten(),
    'observation': tf.keras.layers.Flatten()
}
preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)
#create a q_RNNnet
q_net = q_rnn_network.QRnnNetwork(
    #train_env.time_step_spec(),
    environment.observation_spec(),
    environment.action_spec(),
    preprocessing_layers=preprocessing_layers,
    preprocessing_combiner=preprocessing_combiner,
    input_fc_layer_params=input_fc_layer_params,
    lstm_size=lstm_size,
    output_fc_layer_params=output_fc_layer_params
)
#create optimizer
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
#create a global step counter
global_step = tf.compat.v1.train.get_or_create_global_step()
#create agent
agent = dqn_agent.DqnAgent(
    environment.time_step_spec(),
    environment.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    #observation_and_action_constraint_splitter=CardGameEnvWithMask.observation_action_splitter,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=global_step)
agent.initialize()
# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)
This is the error I get:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Input In [10], in <cell line: 8>()
5 global_step = tf.compat.v1.train.get_or_create_global_step()
7 #create agent
----> 8 agent = dqn_agent.DqnAgent(
9 environment.time_step_spec(),
10 environment.action_spec(),
11 q_network=q_net,
12 optimizer=optimizer,
13 #observation_and_action_constraint_splitter=CardGameEnvWithMask.observation_action_splitter,
14 td_errors_loss_fn=common.element_wise_squared_loss,
15 train_step_counter=global_step)
17 agent.initialize()
19 # (Optional) Optimize by wrapping some of the code in a graph using TF function.
File ~\Anaconda3\envs\dqn\lib\site-packages\gin\config.py:1605, in _make_gin_wrapper.<locals>.gin_wrapper(*args, **kwargs)
1603 scope_info = " in scope '{}'".format(scope_str) if scope_str else ''
1604 err_str = err_str.format(name, fn_or_cls, scope_info)
-> 1605 utils.augment_exception_message_and_reraise(e, err_str)
File ~\Anaconda3\envs\dqn\lib\site-packages\gin\utils.py:41, in augment_exception_message_and_reraise(exception, message)
39 proxy = ExceptionProxy()
40 ExceptionProxy.__qualname__ = type(exception).__qualname__
---> 41 raise proxy.with_traceback(exception.__traceback__) from None
File ~\Anaconda3\envs\dqn\lib\site-packages\gin\config.py:1582, in _make_gin_wrapper.<locals>.gin_wrapper(*args, **kwargs)
1579 new_kwargs.update(kwargs)
1581 try:
-> 1582 return fn(*new_args, **new_kwargs)
1583 except Exception as e: # pylint: disable=broad-except
1584 err_str = ''
File ~\Anaconda3\envs\dqn\lib\site-packages\tf_agents\agents\dqn\dqn_agent.py:235, in DqnAgent.__init__(self, time_step_spec, action_spec, q_network, optimizer, observation_and_action_constraint_splitter, epsilon_greedy, n_step_update, boltzmann_temperature, emit_log_probability, target_q_network, target_update_tau, target_update_period, td_errors_loss_fn, gamma, reward_scale_factor, gradient_clipping, debug_summaries, summarize_grads_and_vars, train_step_counter, name)
232 if observation_and_action_constraint_splitter:
233 net_observation_spec, _ = observation_and_action_constraint_splitter(
234 net_observation_spec)
--> 235 q_network.create_variables(net_observation_spec)
236 if target_q_network:
237 target_q_network.create_variables(net_observation_spec)
File ~\Anaconda3\envs\dqn\lib\site-packages\tf_agents\networks\network.py:217, in Network.create_variables(self, input_tensor_spec, **kwargs)
212 if input_tensor_spec is None:
213 raise ValueError(
214 "Unable to create_variables: no input_tensor_spec provided, and "
215 "Network did not define one.")
--> 217 random_input = tensor_spec.sample_spec_nest(
218 input_tensor_spec, outer_dims=(1,))
219 initial_state = self.get_initial_state(batch_size=1)
220 step_type = tf.fill((1,), time_step.StepType.FIRST)
File ~\Anaconda3\envs\dqn\lib\site-packages\tf_agents\specs\tensor_spec.py:400, in sample_spec_nest(structure, seed, outer_dims, minimum, maximum)
397 else:
398 raise TypeError("Spec type not supported: '{}'".format(spec))
--> 400 return tf.nest.map_structure(sample_fn, structure)
File ~\Anaconda3\envs\dqn\lib\site-packages\tensorflow\python\util\nest.py:914, in map_structure(func, *structure, **kwargs)
910 flat_structure = (flatten(s, expand_composites) for s in structure)
911 entries = zip(*flat_structure)
913 return pack_sequence_as(
--> 914 structure[0], [func(*x) for x in entries],
915 expand_composites=expand_composites)
File ~\Anaconda3\envs\dqn\lib\site-packages\tensorflow\python\util\nest.py:914, in <listcomp>(.0)
910 flat_structure = (flatten(s, expand_composites) for s in structure)
911 entries = zip(*flat_structure)
913 return pack_sequence_as(
--> 914 structure[0], [func(*x) for x in entries],
915 expand_composites=expand_composites)
File ~\Anaconda3\envs\dqn\lib\site-packages\tf_agents\specs\tensor_spec.py:378, in sample_spec_nest.<locals>.sample_fn(spec)
374 return tf.as_string(
375 sample_bounded_spec(
376 sample_spec, outer_dims=outer_dims, seed=seed_stream()))
377 else:
--> 378 bounded_spec = BoundedTensorSpec.from_spec(spec)
380 spec_max = bounded_spec.maximum
381 if maximum is not None:
File ~\Anaconda3\envs\dqn\lib\site-packages\tensorflow\python\framework\tensor_spec.py:321, in BoundedTensorSpec.from_spec(cls, spec)
305 """Returns a `TensorSpec` with the same shape and dtype as `spec`.
306
307 If `spec` is a `BoundedTensorSpec`, then the new spec's bounds are set to
(...)
318 spec: The `TypeSpec` used to create the new `BoundedTensorSpec`.
319 """
320 dtype = dtypes.as_dtype(spec.dtype)
--> 321 minimum = getattr(spec, "minimum", dtype.min)
322 maximum = getattr(spec, "maximum", dtype.max)
323 return BoundedTensorSpec(spec.shape, dtype, minimum, maximum, spec.name)
File ~\Anaconda3\envs\dqn\lib\site-packages\tensorflow\python\framework\dtypes.py:97, in DType.min(self)
89 """Returns the minimum representable value in this data type.
90
91 Raises:
92 TypeError: if this is a non-numeric, unordered, or quantized type.
93
94 """
95 if (self.is_quantized or
96 self.base_dtype in (bool, string, complex64, complex128)):
---> 97 raise TypeError(f"Cannot find minimum value of {self} with "
98 f"{'quantized type' if self.is_quantized else 'type'} "
99 f"{self.base_dtype}.")
101 # there is no simple way to get the min value of a dtype, we have to check
102 # float and int types separately
103 try:
TypeError: Cannot find minimum value of <dtype: 'bool'> with type <dtype: 'bool'>.
In call to configurable 'DqnAgent' (<class 'tf_agents.agents.dqn.dqn_agent.DqnAgent'>)
Can anyone help me, please? Sample code with action masking would also help me understand all this better.
Thank you!
If you don't pass the observation_and_action_constraint_splitter to the DQN agent, then the observation contains both the observation and the mask, and the mask is bool, which cannot be sampled. I'm not sure what the intended use of the mask is when you only have 2 actions.
Also, can you make sure you have the latest version, since sample_spec_nest can now handle tf.bool: https://github.com/tensorflow/agents/blob/master/tf_agents/specs/tensor_spec.py#L380
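For illustration, here is a minimal, untested sketch of how the splitter could be wired in, reusing the names from the code above. It assumes the splitter returns (observation, mask) in that order, and that the Q-network is then built on just the 'observation' sub-spec, since with a splitter the agent calls q_network.create_variables on the split observation (visible in the traceback above):
def observation_action_splitter(obs):
  # Network input first, boolean mask of legal actions second.
  return obs['observation'], obs['mask']

# With the splitter, the network no longer sees the dict observation,
# so the preprocessing dict/combiner from above are not needed here.
q_net = q_rnn_network.QRnnNetwork(
    environment.observation_spec()['observation'],
    environment.action_spec(),
    input_fc_layer_params=input_fc_layer_params,
    lstm_size=lstm_size,
    output_fc_layer_params=output_fc_layer_params)

agent = dqn_agent.DqnAgent(
    environment.time_step_spec(),
    environment.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    observation_and_action_constraint_splitter=observation_action_splitter,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=global_step)
agent.initialize()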
> If you don't pass the observation_and_action_constraint_splitter to the DQN agent, then the observation contains both the observation and the mask, and the mask is bool, which cannot be sampled. I'm not sure what the intended use of the mask is when you only have 2 actions.
Also, with the implementation of:
def observation_action_splitter(obs):
  return obs['mask'], obs['observation']
and:
observation_and_action_constraint_splitter=observation_action_splitter
the error message persists.
I know that masking makes no sense in this example; it is only meant to be a simple example for working out this issue.
> Also, can you make sure you have the latest version, since sample_spec_nest can now handle tf.bool: https://github.com/tensorflow/agents/blob/master/tf_agents/specs/tensor_spec.py#L380
The tf-agents version I use is 0.12.0.
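(For reference, a small standard-library snippet of how the installed version can be checked; in my environment it prints the value above:)
import importlib.metadata
# Query the installed package metadata for tf-agents.
print(importlib.metadata.version('tf-agents'))  # -> 0.12.0 here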