Once the "eagles" are trained with AI reinforcement learning, are the "chicks" getting fewer and fewer?
Breaking MADDPG down into its component algorithms
Multi-Agent: multiple agents acting in the same environment
Deep: like DQN, uses a target network plus experience replay
Deterministic: the policy directly outputs a deterministic action
Policy Gradient: the model is optimized by gradient descent on the policy
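To make two of those points concrete (standard DDPG-style notation; these symbols are not used elsewhere in this article): a deterministic policy maps an observation straight to an action instead of to a distribution over actions,

$$ a = \mu_\theta(o) \quad \text{rather than} \quad a \sim \pi_\theta(\cdot \mid o), $$

and, as in DQN, each network keeps a slowly-updated target copy of itself to stabilize training.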
Introduction to the PG algorithm
What is multi-agent RL, and what environments are there?
Reading MADDPG through PARL's code
My original plan was to compare PARL's DDPG code against its MADDPG code, but the two were not written by the same person, so the implementations differ in ways that have little to do with the algorithms, and it is hard to pick out the real differences between them. I therefore decided to just walk through the MADDPG code on its own instead of doing a side-by-side comparison.
Algorithm:
from copy import deepcopy  # the target network starts as an exact copy of the model
self.target_model = deepcopy(model)
The Actor-Critic structure
Feed the Actor the environment observation obs and it outputs an action; feed that action together with the same observation obs to the Critic, and the Critic outputs the Q value.
def predict(self, obs):
    """ input:
            obs: observation, shape([B] + shape of obs_n[agent_index])
        output:
            act: action, shape([B] + shape of act_n[agent_index])
    """
    this_policy = self.model.policy(obs)
    this_action = SoftPDistribution(
        logits=this_policy,
        act_space=self.act_space[self.agent_index]).sample()
    return this_action

def predict_next(self, obs):
    """ input:  observation, shape([B] + shape of obs_n[agent_index])
        output: action, shape([B] + shape of act_n[agent_index])
    """
    next_policy = self.target_model.policy(obs)
    next_action = SoftPDistribution(
        logits=next_policy,
        act_space=self.act_space[self.agent_index]).sample()
    return next_action

# Critic
def Q(self, obs_n, act_n):
    """ input:
            obs_n: all agents' observations, shape([B] + shape of obs_n)
            act_n: all agents' actions, shape([B] + shape of act_n)
        output:
            Q: Q value of this agent, shape([B])
    """
    return self.model.value(obs_n, act_n)

def Q_next(self, obs_n, act_n):
    """ input:
            obs_n: all agents' observations, shape([B] + shape of obs_n)
            act_n: all agents' actions, shape([B] + shape of act_n)
        output:
            Q: target Q value of this agent, shape([B])
    """
    return self.target_model.value(obs_n, act_n)
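For reference, self.model.policy and self.model.value above come from the agent's model class, which is defined elsewhere in the example. A minimal sketch of what such a model could look like with fluid layers is below; the layer sizes and the class name are illustrative only, and the real PARL model additionally exposes get_actor_params() / get_critic_params(), which this sketch omits.

import paddle.fluid as fluid
from paddle.fluid import layers

class MAModelSketch(object):
    """Illustrative actor/critic networks, not PARL's actual MAModel."""

    def __init__(self, act_dim):
        self.act_dim = act_dim

    def policy(self, obs):
        # Actor: this agent's observation -> logits over its action space
        hid = layers.fc(input=obs, size=64, act='relu')
        return layers.fc(input=hid, size=self.act_dim)

    def value(self, obs_n, act_n):
        # centralized Critic: every agent's observation and action -> one Q value
        concat = layers.concat(obs_n + act_n, axis=1)
        hid = layers.fc(input=concat, size=64, act='relu')
        q = layers.fc(input=hid, size=1)
        return layers.squeeze(q, axes=[1])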
Updating the Actor network's parameters
i = self.agent_index
this_policy = self.model.policy(obs_n[i])
sample_this_action = SoftPDistribution(
    logits=this_policy,
    act_space=self.act_space[self.agent_index]).sample()

# replace this agent's action in the joint action with the freshly sampled one
action_input_n = act_n + []  # shallow copy of the action list
action_input_n[i] = sample_this_action
eval_q = self.Q(obs_n, action_input_n)

# maximizing Q is the same as minimizing -Q; add a small regularizer on the logits
act_cost = layers.reduce_mean(-1.0 * eval_q)
act_reg = layers.reduce_mean(layers.square(this_policy))
cost = act_cost + act_reg * 1e-3

fluid.clip.set_gradient_clip(
    clip=fluid.clip.GradientClipByNorm(clip_norm=0.5),
    param_list=self.model.get_actor_params())
optimizer = fluid.optimizer.AdamOptimizer(self.lr)
optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
return cost
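Up to the small act_reg penalty on the policy logits, this minimizes the negative of the centralized Q value, which is the per-agent policy gradient from the MADDPG paper: only agent i's own action is re-sampled from its current policy, while the other agents' actions come straight from the batch,

$$ \nabla_{\theta_i} J(\mu_i) = \mathbb{E}_{\mathbf{x},a \sim \mathcal{D}}\Big[ \nabla_{\theta_i}\mu_i(o_i)\, \nabla_{a_i} Q_i^{\mu}(\mathbf{x}, a_1, \dots, a_N)\big|_{a_i=\mu_i(o_i)} \Big] $$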
Updating the Critic network's parameters
pred_q = self.Q(obs_n, act_n)
cost = layers.reduce_mean(layers.square_error_cost(pred_q, target_q))
fluid.clip.set_gradient_clip(
    clip=fluid.clip.GradientClipByNorm(clip_norm=0.5),
    param_list=self.model.get_critic_params())
optimizer = fluid.optimizer.AdamOptimizer(self.lr)
optimizer.minimize(cost, parameter_list=self.model.get_critic_params())
return cost
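Here target_q is the TD target that the Agent's learn() step (shown later) builds from the target networks, and the Critic is fit to it with a squared error, exactly as in the paper:

$$ \mathcal{L}(\theta_i) = \mathbb{E}\Big[\big(Q_i^{\mu}(\mathbf{x}, a_1, \dots, a_N) - y\big)^2\Big], \qquad y = r_i + \gamma\, Q_i^{\mu'}(\mathbf{x}', a_1', \dots, a_N')\big|_{a_j' = \mu_j'(o_j')} $$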
Setting up the Agent: build_program
self.pred_program = fluid.Program()    # Actor
self.learn_program = fluid.Program()   # Critic
self.next_q_program = fluid.Program()  # target Critic
self.next_a_program = fluid.Program()  # target Actor

# Actor: takes the environment observation and outputs an action
with fluid.program_guard(self.pred_program):
    # observation of the environment
    obs = layers.data(
        name='obs',
        shape=[self.obs_dim_n[self.agent_index]],
        dtype='float32')
    self.pred_act = self.alg.predict(obs)

# Critic: takes all observations plus the corresponding Actor actions and outputs the score Q
with fluid.program_guard(self.learn_program):
    # observations of the environment
    obs_n = [
        layers.data(
            name='obs' + str(i),
            shape=[self.obs_dim_n[i]],
            dtype='float32') for i in range(self.n)
    ]
    # actions the Actors produced for those observations
    act_n = [
        layers.data(
            name='act' + str(i),
            shape=[self.act_dim_n[i]],
            dtype='float32') for i in range(self.n)
    ]
    target_q = layers.data(name='target_q', shape=[], dtype='float32')
    self.critic_cost = self.alg.learn(obs_n, act_n, target_q)

# target Critic: same inputs as the Critic, used to stabilize the Q value
with fluid.program_guard(self.next_q_program):
    # observations of the environment
    obs_n = [
        layers.data(
            name='obs' + str(i),
            shape=[self.obs_dim_n[i]],
            dtype='float32') for i in range(self.n)
    ]
    # actions the Actors produced for those observations
    act_n = [
        layers.data(
            name='act' + str(i),
            shape=[self.act_dim_n[i]],
            dtype='float32') for i in range(self.n)
    ]
    self.next_Q = self.alg.Q_next(obs_n, act_n)

# target Actor: takes the environment observation and outputs an action
with fluid.program_guard(self.next_a_program):
    # observation of the environment
    obs = layers.data(
        name='obs',
        shape=[self.obs_dim_n[self.agent_index]],
        dtype='float32')
    self.next_action = self.alg.predict_next(obs)

if self.speedup:
    self.pred_program = parl.compile(self.pred_program)
    self.learn_program = parl.compile(self.learn_program, self.critic_cost)
    self.next_q_program = parl.compile(self.next_q_program)
    self.next_a_program = parl.compile(self.next_a_program)
A program whose inputs contain only obs belongs to the Actor, because the Actor only needs the environment observation to produce an action;
a program whose inputs contain both obs and act belongs to the Critic, which scores the Actor's action act given the observation obs, and that score is the Q value.
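As a usage note: once build_program has run, getting an action for one agent just means executing pred_program. A minimal sketch in the usual PARL Agent style follows; the predict() in the official example may differ in small details such as the dtype cast.

import numpy as np

def predict(self, obs):
    # add the batch dimension expected by the 'obs' placeholder
    obs = np.expand_dims(obs, axis=0).astype('float32')
    act = self.fluid_executor.run(
        self.pred_program,
        feed={'obs': obs},
        fetch_list=[self.pred_act])[0]
    return act[0]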
The core of the MADDPG algorithm
self.global_train_step += 1

# only learn() after the replay memory holds enough data,
# and only update the parameters every 100 steps
if self.global_train_step % 100 != 0:
    return 0.0
if self.rpm.size() <= self.min_memory_size:
    return 0.0

# sample from every agent's replay memory: the current observations,
# the actions taken under them, and the observations after those actions
batch_obs_n = []
batch_act_n = []
batch_obs_new_n = []
rpm_sample_index = self.rpm.make_index(self.batch_size)
for i in range(self.n):
    batch_obs, batch_act, _, batch_obs_new, _ \
        = agents[i].rpm.sample_batch_by_index(rpm_sample_index)
    batch_obs_n.append(batch_obs)
    batch_act_n.append(batch_act)
    batch_obs_new_n.append(batch_obs_new)
_, _, batch_rew, _, batch_isOver \
    = self.rpm.sample_batch_by_index(rpm_sample_index)

# compute target q
target_q = 0.0
target_act_next_n = []
for i in range(self.n):
    feed = {'obs': batch_obs_new_n[i]}
    target_act_next = agents[i].fluid_executor.run(
        agents[i].next_a_program,  # each agent samples its next action independently
        feed=feed,
        fetch_list=[agents[i].next_action])[0]
    target_act_next_n.append(target_act_next)
feed_obs = {'obs' + str(i): batch_obs_new_n[i] for i in range(self.n)}
feed_act = {'act' + str(i): target_act_next_n[i] for i in range(self.n)}
feed = feed_obs.copy()
feed.update(feed_act)  # merge the two dicts
target_q_next = self.fluid_executor.run(
    self.next_q_program,  # target network of the globally-observing Critic, used to stabilize Q_target
    feed=feed,
    fetch_list=[self.next_Q])[0]
target_q += (
    batch_rew + self.alg.gamma * (1.0 - batch_isOver) * target_q_next)

# train the Critic that observes the global state
feed_obs = {'obs' + str(i): batch_obs_n[i] for i in range(self.n)}
feed_act = {'act' + str(i): batch_act_n[i] for i in range(self.n)}
target_q = target_q.astype('float32')
feed = feed_obs.copy()
feed.update(feed_act)
feed['target_q'] = target_q
critic_cost = self.fluid_executor.run(
    self.learn_program,
    feed=feed,
    fetch_list=[self.critic_cost])[0]
self.alg.sync_target()
return critic_cost
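The closing self.alg.sync_target() call is what keeps the two target networks trailing the trained ones; its body is not shown in this article. A sketch of how PARL's DDPG-family algorithms typically implement it (the tau value here is illustrative):

def sync_target(self, decay=None):
    # soft update: target_weights = decay * target_weights + (1 - decay) * weights
    if decay is None:
        decay = 1.0 - self.tau  # e.g. tau = 0.01
    self.model.sync_weights_to(self.target_model, decay=decay)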
Reproducing the "eagle catches chicks" game environment
This game environment can be found in OpenAI's code repository. It offers six scenarios, from simple to complex. Since it is a pursuit game and the official scenario names do not translate nicely, I just call the environment "eagle catches chicks". Set up the dependencies for the game:
# Tip: on AI Studio, uninstalling these two libraries before importing parl silences some warnings;
# parl also works fine if you skip this step
!pip uninstall -y pandas scikit-learn
!pip install paddlepaddle-gpu==1.6.3.post97 -i https://mirror.baidu.com/pypi/simple
!pip install parl==1.3.1
# be sure to install gym==0.10.5; other versions will raise errors
!pip install gym==0.10.5 -i https://mirror.baidu.com/pypi/simple
# if the next line fails in the notebook, run it from a terminal instead
!cd multiagent-particle-envs && pip install -e .
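Once everything is installed, a scenario can be wrapped into an environment roughly the way the multiagent-particle-envs README shows; simple_tag is the predator-prey scenario this article nicknames "eagle catches chicks":

from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios

# load the predator-prey scenario and build its world
scenario = scenarios.load('simple_tag.py').Scenario()
world = scenario.make_world()
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                    scenario.observation)

obs_n = env.reset()  # one observation per agent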
Back to the paper
Finally, let's go back to the paper itself.
Summary and outlook
MADDPG is an improvement built on top of DDPG, and its core ideas are twofold: on the one hand, it keeps DDPG's Actor-Critic (actor and critic) structure; on the other hand, on top of that structure, each agent's Actor samples its actions independently from its own observation, while each agent's Critic has access to global information and uses it to guide that Actor's actions.