HARL
HARL copied to clipboard
What is the difference between FP and EP?
# Build the `data` tuple; its layout depends on `self.state_type`.
# Both branches emit the same eleven fields in the same order. They differ
# only in how the shared observation, reward, done flag and next shared
# observation are selected. If `state_type` is neither "EP" nor "FP",
# `data` is left unassigned here.
if self.state_type == "EP":
    # EP: keep only index 0 along the second axis of the shared
    # observation, reward and next shared observation.
    # NOTE(review): presumably every agent holds an identical copy of
    # these values in EP mode, so one column is enough - confirm against
    # the environment wrapper.
    first_share_obs = share_obs[:, 0]  # (n_threads, share_obs_dim)
    first_rewards = rewards[:, 0]  # (n_threads, 1)
    env_done_flags = np.expand_dims(dones_env, axis=-1)  # (n_threads, 1)
    valid_agent_major = valid_transitions.transpose(1, 0, 2)  # (n_agents, n_threads, 1)
    first_next_share_obs = next_share_obs[:, 0]  # (n_threads, next_share_obs_dim)
    next_obs_agent_major = next_obs.transpose(1, 0, 2)  # (n_agents, n_threads, next_obs_dim)
    data = (
        first_share_obs,
        obs,  # (n_agents, n_threads, obs_dim)
        actions,  # (n_agents, n_threads, action_dim)
        available_actions,  # None or (n_agents, n_threads, action_number)
        first_rewards,
        env_done_flags,
        valid_agent_major,
        terms,  # (n_threads, 1)
        first_next_share_obs,
        next_obs_agent_major,
        next_available_actions,  # None or (n_agents, n_threads, next_action_number)
    )
elif self.state_type == "FP":
    # FP: pass the per-agent arrays through whole, and use the per-agent
    # `dones` instead of the environment-level `dones_env`.
    agent_done_flags = np.expand_dims(dones, axis=-1)  # (n_threads, n_agents, 1)
    valid_agent_major = valid_transitions.transpose(1, 0, 2)  # (n_agents, n_threads, 1)
    next_obs_agent_major = next_obs.transpose(1, 0, 2)  # (n_agents, n_threads, next_obs_dim)
    data = (
        share_obs,  # (n_threads, n_agents, share_obs_dim)
        obs,  # (n_agents, n_threads, obs_dim)
        actions,  # (n_agents, n_threads, action_dim)
        available_actions,  # None or (n_agents, n_threads, action_number)
        rewards,  # (n_threads, n_agents, 1)
        agent_done_flags,
        valid_agent_major,
        terms,  # (n_threads, n_agents, 1)
        next_share_obs,  # (n_threads, n_agents, next_share_obs_dim)
        next_obs_agent_major,
        next_available_actions,  # None or (n_agents, n_threads, next_action_number)
    )
When self.state_type == "EP", why is only the first agent's reward taken (rewards[:, 0]), and why is the second agent's reward ignored?