Could you provide a Paddle version of the MATD3 algorithm?
Is MATD3 just MADDPG + TD3?
Yes, it is TD3 for the multi-agent setting, i.e. replacing the DDPG inside MADDPG with TD3. I'd like to use the PARL framework because it is clear and well organized. Thanks!
Got it. We implemented this ourselves during an earlier investigation, and the implementation cost is low, so how about opening a PR for us? You can make the change by referring to the MADDPG implementation at https://github.com/PaddlePaddle/PARL/blob/develop/parl/algorithms/paddle/maddpg.py, and that way you also become a PARL contributor :)
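For reference, here is a minimal sketch (illustrative names only, not code from this thread) of what the change described above amounts to for a centralized critic: each agent's next action comes from its target actor plus clipped smoothing noise, and the bootstrap value is the minimum of two target critics.

import paddle

def td3_style_target(target_actors, twin_target_critic, obs_next_n,
                     rew, done, gamma, policy_noise=0.2, noise_clip=0.5):
    # target policy smoothing for every agent's next action
    act_next_n = []
    for actor, obs_next in zip(target_actors, obs_next_n):
        act = actor(obs_next)
        noise = paddle.clip(
            paddle.randn(act.shape) * policy_noise, -noise_clip, noise_clip)
        act_next_n.append(paddle.clip(act + noise, -1., 1.))
    # the centralized twin critic sees all observations and all smoothed actions;
    # clipped double-Q takes the minimum of the two target critics
    q1, q2 = twin_target_critic(obs_next_n, act_next_n)
    return rew + gamma * (1. - done) * paddle.minimum(q1, q2)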
I tried modifying the Paddle MADDPG, but the results are not good, and I haven't found the problem yet ><
You can paste the code in this issue and we'll take a look together.
Sure! Here is matd3.py:
import parl
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from parl.utils.utils import check_model_method
from copy import deepcopy

__all__ = ['MATD3']

from parl.core.paddle.policy_distribution import SoftCategoricalDistribution
from parl.core.paddle.policy_distribution import SoftMultiCategoricalDistribution


def SoftPDistribution(logits, act_space):
    if hasattr(act_space, 'n'):
        return SoftCategoricalDistribution(logits)
    # is instance of multiagent.multi_discrete.MultiDiscrete
    elif hasattr(act_space, 'num_discrete_space'):
        return SoftMultiCategoricalDistribution(logits, act_space.low,
                                                act_space.high)
    else:
        raise AssertionError("act_space must be instance of \
            gym.spaces.Discrete or multiagent.multi_discrete.MultiDiscrete")


class MATD3(parl.Algorithm):
    def __init__(self,
                 model,
                 agent_index=None,
                 act_space=None,
                 gamma=None,
                 tau=None,
                 actor_lr=None,
                 critic_lr=None,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2):
        check_model_method(model, 'value', self.__class__.__name__)
        check_model_method(model, 'policy', self.__class__.__name__)
        check_model_method(model, 'Q1', self.__class__.__name__)
        check_model_method(model, 'get_actor_params', self.__class__.__name__)
        check_model_method(model, 'get_critic_params', self.__class__.__name__)
        assert isinstance(agent_index, int)
        assert isinstance(act_space, list)
        assert isinstance(gamma, float)
        assert isinstance(tau, float)
        assert isinstance(actor_lr, float)
        assert isinstance(critic_lr, float)

        self.agent_index = agent_index
        self.act_space = act_space
        self.gamma = gamma
        self.tau = tau
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0

        self.model = model
        self.target_model = deepcopy(self.model)
        self.sync_target(0)

        self.actor_optimizer = paddle.optimizer.Adam(
            learning_rate=actor_lr,
            parameters=self.model.get_actor_params(),
            grad_clip=nn.ClipGradByNorm(clip_norm=0.5))
        self.critic_optimizer = paddle.optimizer.Adam(
            learning_rate=critic_lr,
            parameters=self.model.get_critic_params(),
            grad_clip=nn.ClipGradByNorm(clip_norm=0.5))

    def predict(self, obs, use_target_model=False):
        if use_target_model:
            policy = self.target_model.policy(obs)
        else:
            policy = self.model.policy(obs)
        action = SoftPDistribution(
            logits=policy,
            act_space=self.act_space[self.agent_index]).sample()
        return action

    def Q(self, obs_n, act_n, use_target_model=False):
        if use_target_model:
            return self.target_model.value(obs_n, act_n)
        else:
            return self.model.value(obs_n, act_n)

    def learn(self, obs_n, act_n, target_q):
        self.total_it += 1
        critic_loss = self._critic_learn(obs_n, act_n, target_q)
        if self.total_it % self.policy_freq == 0:
            actor_loss = self._actor_learn(obs_n, act_n)
        return critic_loss

    def _critic_learn(self, obs_n, act_n, target_q):
        current_q1, current_q2 = self.Q(obs_n, act_n, use_target_model=False)
        critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(
            current_q2, target_q)
        self.critic_optimizer.clear_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        return critic_loss

    def _actor_learn(self, obs_n, act_n):
        i = self.agent_index
        this_policy = self.model.policy(obs_n[i])
        sample_this_action = SoftPDistribution(
            logits=this_policy,
            act_space=self.act_space[self.agent_index]).sample()

        # action_input_n = deepcopy(act_n)
        action_input_n = act_n + []
        action_input_n[i] = sample_this_action
        actor_loss = paddle.mean(-self.model.Q1(obs_n, action_input_n))

        act_reg = paddle.mean(paddle.square(this_policy))
        cost = act_reg + actor_loss * 1e-3

        self.actor_optimizer.clear_grad()
        cost.backward()
        self.actor_optimizer.step()
        self.sync_target()
        return cost

    def sync_target(self, decay=None):
        if decay is None:
            decay = 1. - self.tau
        self.model.sync_weights_to(self.target_model, decay=decay)
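For illustration, a hypothetical way the algorithm above might be instantiated for agent i; the variable names and hyperparameter values are assumptions (they mirror PARL's MADDPG example), not values from this thread.

# hypothetical wiring (names are illustrative); `model` must expose the
# policy/value/Q1/get_actor_params/get_critic_params methods checked in __init__
algorithm = MATD3(
    model,
    agent_index=i,
    act_space=env.action_space,  # assumed: a list of per-agent action spaces
    gamma=0.95,                  # the values below follow PARL's MADDPG example
    tau=0.01,
    actor_lr=0.01,
    critic_lr=0.01,
    policy_noise=0.2,
    noise_clip=0.5,
    policy_freq=2)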
You can paste the code in this issue and we'll take a look together.
import parl
import paddle
import paddle.nn as nn
import paddle.nn.functional as F


class MAModel(parl.Model):
    def __init__(self, obs_dim, action_dim, critic_in_dim):
        super(MAModel, self).__init__()
        self.actor_model = Actor(obs_dim, action_dim)
        self.critic_model = Critic(critic_in_dim)

    def policy(self, obs):
        return self.actor_model(obs)

    def value(self, obs, action):
        return self.critic_model(obs, action)

    def Q1(self, obs, action):
        return self.critic_model.Q1(obs, action)

    def get_actor_params(self):
        return self.actor_model.parameters()

    def get_critic_params(self):
        return self.critic_model.parameters()


class Actor(parl.Model):
    def __init__(self, obs_dim, action_dim):
        super(Actor, self).__init__()
        self.l1 = nn.Linear(
            obs_dim, 64,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.XavierUniform()))
        self.l2 = nn.Linear(
            64, 64,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.XavierUniform()))
        self.l3 = nn.Linear(
            64, action_dim,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.XavierUniform()))

    def forward(self, obs):
        x = F.relu(self.l1(obs))
        x = F.relu(self.l2(x))
        action = self.l3(x)
        return action


class Critic(parl.Model):
    def __init__(self, critic_in_dim):
        super(Critic, self).__init__()
        # Q1 architecture
        self.l1 = nn.Linear(
            critic_in_dim, 64,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.XavierUniform()))
        self.l2 = nn.Linear(
            64, 64,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.XavierUniform()))
        self.l3 = nn.Linear(
            64, 1,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.XavierUniform()))
        # Q2 architecture
        self.l4 = nn.Linear(
            critic_in_dim, 64,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.XavierUniform()))
        self.l5 = nn.Linear(
            64, 64,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.XavierUniform()))
        self.l6 = nn.Linear(
            64, 1,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.XavierUniform()))

    def forward(self, obs, action):
        # sa = paddle.concat([obs, action], 1)
        sa = paddle.concat(obs + action, axis=1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)

        q2 = F.relu(self.l4(sa))
        q2 = F.relu(self.l5(q2))
        q2 = self.l6(q2)
        return q1, q2

    def Q1(self, obs, action):
        sa = paddle.concat(obs + action, axis=1)
        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)
        return q1
The modified part of simple_agent.py is as follows:
target_act_next_n = []
for i in range(self.n):
    target_act_next = agents[i].alg.predict(
        batch_obs_next_n[i], use_target_model=True)
    noise = np.random.normal(
        0, self.alg.policy_noise, size=target_act_next.shape)
    noise = paddle.clip(noise, -self.alg.noise_clip, self.alg.noise_clip)
    target_act_next = target_act_next + noise
    target_act_next = paddle.clip(target_act_next, -1., 1.)
    target_act_next = target_act_next.detach()
    target_act_next_n.append(target_act_next)

target_q1, target_q2 = self.alg.Q(
    batch_obs_next_n, target_act_next_n, use_target_model=True)
target_q_next = paddle.minimum(target_q1, target_q2)
target_q = batch_rew + self.alg.gamma * (
    1.0 - batch_isOver) * target_q_next.detach()

# learn
critic_cost = self.alg.learn(batch_obs_n, batch_act_n, target_q)
critic_cost = critic_cost.cpu().detach().numpy()[0]
return critic_cost
The changes look fine to me. What problem are you running into at the moment? Does the algorithm not converge at all, or does it converge but with poor results?
It doesn't converge; the mean episode reward never improves.
I'll run your code later and take a look :)
Great, thank you for your trouble!
Hello, we found a bug in your code:
cost = act_reg + actor_loss * 1e-3
When computing this cost, the 1e-3 should be applied to act_reg on the left, since that is the regularization term.
We have also written a fresh MATD3 implementation. It won't go into the official PARL repository, but you can refer to it directly:
https://github.com/ZiyuanMa/MATD3
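In other words, the intended weighting (a one-line sketch of the fix to the snippet quoted above) is:

cost = act_reg * 1e-3 + actor_loss  # scale the regularizer, not the actor loss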
Thank you very much for the patient explanations! I learned a lot. Wishing Paddle continued success.
Hey, do you have the original MATD3 paper?