本文主要参考王树森老师的强化学习课程
1.A2C算法原理
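A2C(Advantage Actor-Critic)算法是策略学习中比较经典的一个算法,它所基于的 Actor-Critic 方法由 Barto 等人于 1983 年提出。我们知道,策略梯度方法用策略梯度更新策略网络参数 $\theta$,从而增大目标函数;实际更新时使用的是策略梯度的无偏估计,即下面的随机梯度:$$g(s,a;\theta) = Q_\pi(s,a)\cdot\nabla_\theta \ln \pi(a\mid s;\theta)$$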
Actor-Critic 方法中用一个神经网络近似动作价值函数 $Q_\pi(s,a)$,这个神经网络叫做“价值网络”,记为 $q(s,a;w)$,其中的 $w$ 表示神经网络中可训练的参数。价值网络的输入是状态 $s$,输出是每个动作的价值。动作空间 $A$ 中有多少种动作,价值网络的输出就是多少维的向量,向量的每个元素对应一个动作。举个例子,动作空间是 $A = \{左, 右, 上\}$,那么价值网络的输出是一个三维向量:$[\,q(s,左;w),\ q(s,右;w),\ q(s,上;w)\,]^\top$,其中每个元素是对应动作的价值估计。
神经网络可以采用以下结构:
虽然价值网络 q(s,a;w) 与 DQN 有相同的结构,但是两者的意义不同,训练算法也不同。
- 价值网络是对动作价值函数 $Q_\pi(s,a)$ 的近似,而 DQN 则是对最优动作价值函数 $Q_\star(s,a)$ 的近似。
- 对价值网络的训练使用的是SARSA算法,它属于同策略,不能用经验回放。对DQN的训练使用的是 Q 学习算法,它属于异策略,可以用经验回放。
Actor-Critic 翻译成“演员—评论家”方法。策略网络 π(a|s;θ) 相当于演员,它基于状态 s 做出动作 a。价值网络 q(s,a;w) 相当于评论家,它给演员的表现打分,量化在状态 s 的情况下做出动作 a 的好坏程度。策略网络(演员)和价值网络(评论家)的关系如下图所示。
2. A2C算法训练流程
设当前策略网络参数是 $\theta_{\text{now}}$,价值网络参数是 $w_{\text{now}}$。执行下面的步骤,将参数更新成 $\theta_{\text{new}}$ 和 $w_{\text{new}}$:
3.A2C代码实现
基于 PyTorch,在 Gym 的基础环境中选择经典的倒立摆环境 CartPole-v0 进行验证。
3.1 算法代码:
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCritic(nn.Module):
    """Actor-critic model made of two independent two-layer MLPs.

    The critic maps a state to a single scalar value estimate; the actor maps
    a state to a probability distribution over ``output_dim`` discrete actions.

    Args:
        input_dim: Size of the state vector.
        output_dim: Number of discrete actions.
        hidden_dim: Width of the hidden layer in both networks.
    """

    def __init__(self, input_dim, output_dim, hidden_dim):
        super().__init__()
        self.critic = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )
        self.actor = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            # Normalise over the last (action) axis so that both batched
            # (B, input_dim) and un-batched (input_dim,) states are accepted;
            # for 2-D input this is identical to the previous dim=1.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return ``(dist, value)`` for the state tensor ``x``.

        ``dist`` is a ``Categorical`` built from the actor probabilities and
        ``value`` is the raw critic output.
        """
        value = self.critic(x)
        probs = self.actor(x)
        dist = Categorical(probs)
        return dist, value


class A2C:
    """Thin holder for an ``ActorCritic`` model, its optimizer and ``gamma``.

    Args:
        state_dim: Size of the state vector.
        action_dim: Number of discrete actions.
        cfg: Configuration object providing ``gamma`` and ``device``, the
            hidden width as ``hidden_dim`` (or the legacy name
            ``hidden_size``), and optionally ``lr``.
    """

    def __init__(self, state_dim, action_dim, cfg) -> None:
        self.gamma = cfg.gamma
        self.device = cfg.device
        # The experiment config names this field ``hidden_dim``; the original
        # code read ``hidden_size`` and failed with AttributeError. Accept both.
        hidden_dim = getattr(cfg, "hidden_dim", None)
        if hidden_dim is None:
            hidden_dim = cfg.hidden_size
        self.model = ActorCritic(state_dim, action_dim, hidden_dim).to(self.device)
        # Honour the configured learning rate; 1e-3 is Adam's own default, so
        # configs without ``lr`` behave exactly as before.
        self.optimizer = optim.Adam(
            self.model.parameters(), lr=getattr(cfg, "lr", 1e-3)
        )

    def compute_returns(self, next_value, rewards, masks):
        """Compute discounted returns backwards from a bootstrap value.

        Args:
            next_value: Value estimate used to bootstrap after the last step.
            rewards: Per-step rewards, oldest first.
            masks: Per-step multipliers applied to the bootstrapped term
                (0 cuts the return at that step, 1 lets it propagate).

        Returns:
            A list with one return per step, in the same order as ``rewards``.
        """
        running = next_value
        reversed_returns = []
        for reward, mask in zip(reversed(rewards), reversed(masks)):
            running = reward + self.gamma * running * mask
            reversed_returns.append(running)
        # Append-then-reverse avoids the O(n^2) cost of insert(0, ...).
        reversed_returns.reverse()
        return reversed_returns
3.2 实验代码:
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute directory of this file
parent_path = os.path.dirname(curr_path)  # parent directory
sys.path.append(parent_path)  # make the sibling ``common`` package importable

import gym
import numpy as np
import torch
import torch.optim as optim
import datetime
from common.multiprocessing_env import SubprocVecEnv
from a2c import ActorCritic
from common.utils import save_results, make_dir
from common.utils import plot_rewards

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # timestamp used in output paths
algo_name = 'A2C'  # algorithm name
env_name = 'CartPole-v0'  # environment name


class A2CConfig:
    """Hyper-parameters for the A2C training run."""

    def __init__(self) -> None:
        self.algo_name = algo_name  # algorithm name
        self.env_name = env_name  # environment name
        self.n_envs = 8  # number of parallel environments
        self.gamma = 0.99  # discount factor
        self.hidden_dim = 256  # hidden width passed to ActorCritic
        self.lr = 1e-3  # learning rate
        self.max_frames = 30000  # training stops once frame_idx reaches this
        self.n_steps = 5  # rollout length per update
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class PlotConfig:
    """Settings for saving and plotting results."""

    def __init__(self) -> None:
        self.algo_name = algo_name  # algorithm name
        self.env_name = env_name  # environment name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # detect GPU
        # NOTE: the trailing '/' matters; the helpers in common.utils build
        # file names by concatenating onto these strings.
        self.result_path = curr_path+"/outputs/" + self.env_name + \
            '/'+curr_time+'/results/'  # where results are saved
        self.model_path = curr_path+"/outputs/" + self.env_name + \
            '/'+curr_time+'/models/'  # where models are saved
        self.save = True  # whether to save the figure


def make_envs(env_name):
    """Return a zero-argument factory that builds a seeded ``env_name`` env."""
    def _thunk():
        env = gym.make(env_name)
        env.seed(2)
        return env
    return _thunk


def ceshi_env(env, model, vis=False, device=None):
    """Run one evaluation episode and return its total reward.

    Args:
        env: A single (non-vectorised) environment.
        model: Callable returning ``(dist, value)`` for a batch of states.
        vis: Render the environment at every step when true.
        device: Device for the state tensor. Defaults to the device of the
            model's parameters; the original read a module-level ``cfg`` that
            only exists when this file is run as a script.
    """
    if device is None:
        device = next(model.parameters()).device
    state = env.reset()
    if vis:
        env.render()
    done = False
    total_reward = 0
    # Evaluation needs no gradients; skip building autograd graphs.
    with torch.no_grad():
        while not done:
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            dist, _ = model(state)
            next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
            state = next_state
            if vis:
                env.render()
            total_reward += reward
    return total_reward


def compute_returns(next_value, rewards, masks, gamma=0.99):
    """Compute discounted returns backwards from the bootstrap ``next_value``.

    ``masks[t]`` multiplies the bootstrapped term at step ``t`` (0 cuts the
    return there). The result has one entry per step, oldest first.
    """
    running = next_value
    reversed_returns = []
    for reward, mask in zip(reversed(rewards), reversed(masks)):
        running = reward + gamma * running * mask
        reversed_returns.append(running)
    reversed_returns.reverse()
    return reversed_returns


def train(cfg, envs):
    """Train an ``ActorCritic`` model with n-step A2C on vectorised envs.

    Every 100 frames the current policy is evaluated for 10 episodes on a
    separate single environment.

    Args:
        cfg: Configuration providing ``env_name``, ``algo_name``, ``device``,
            ``hidden_dim``, ``lr``, ``gamma``, ``max_frames`` and ``n_steps``.
        envs: Vectorised environment exposing ``observation_space``,
            ``action_space``, ``reset()`` and ``step(actions)``.

    Returns:
        ``(test_rewards, test_ma_rewards)``: mean evaluation rewards and their
        exponential moving average (factor 0.9).
    """
    print('开始训练!')
    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
    env = gym.make(cfg.env_name)  # a single env used only for evaluation
    env.seed(10)
    state_dim = envs.observation_space.shape[0]
    action_dim = envs.action_space.n
    model = ActorCritic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
    # Use the configured learning rate (it was previously ignored).
    optimizer = optim.Adam(model.parameters(), lr=cfg.lr)
    frame_idx = 0
    test_rewards = []
    test_ma_rewards = []
    state = envs.reset()
    try:
        while frame_idx < cfg.max_frames:
            log_probs = []
            values = []
            rewards = []
            masks = []
            entropy = 0
            # rollout trajectory
            for _ in range(cfg.n_steps):
                state = torch.FloatTensor(state).to(cfg.device)
                dist, value = model(state)
                action = dist.sample()
                next_state, reward, done, _ = envs.step(action.cpu().numpy())
                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()
                log_probs.append(log_prob)
                values.append(value)
                rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(cfg.device))
                masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(cfg.device))
                state = next_state
                frame_idx += 1
                if frame_idx % 100 == 0:
                    test_reward = np.mean(
                        [ceshi_env(env, model, device=cfg.device) for _ in range(10)]
                    )
                    print(f"frame_idx:{frame_idx}, test_reward:{test_reward}")
                    test_rewards.append(test_reward)
                    if test_ma_rewards:
                        test_ma_rewards.append(0.9*test_ma_rewards[-1]+0.1*test_reward)
                    else:
                        test_ma_rewards.append(test_reward)
            next_state = torch.FloatTensor(next_state).to(cfg.device)
            _, next_value = model(next_state)
            # Discount with the configured gamma (it was previously ignored).
            returns = compute_returns(next_value, rewards, masks, gamma=cfg.gamma)
            # log_prob is (n_envs,) while value/return are (n_envs, 1). Without
            # the extra axis the product below broadcasts to an (N, N) matrix
            # and the actor loss degenerates to mean(log_prob) * mean(advantage).
            log_probs = torch.cat(log_probs).unsqueeze(1)
            returns = torch.cat(returns).detach()
            values = torch.cat(values)
            advantage = returns - values
            actor_loss = -(log_probs * advantage.detach()).mean()
            critic_loss = advantage.pow(2).mean()
            loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    finally:
        env.close()  # release the evaluation environment
    print('完成训练!')
    return test_rewards, test_ma_rewards


if __name__ == "__main__":
    cfg = A2CConfig()
    plot_cfg = PlotConfig()
    envs = SubprocVecEnv([make_envs(cfg.env_name) for _ in range(cfg.n_envs)])
    # training
    try:
        rewards, ma_rewards = train(cfg, envs)
    finally:
        envs.close()  # shut the worker processes down even if training fails
    make_dir(plot_cfg.result_path, plot_cfg.model_path)
    save_results(rewards, ma_rewards, tag='train',
                 path=plot_cfg.result_path)  # save results
    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # plot results
3.2 一些依赖的文件(common文件夹)
3.2.1 multiprocessing_env.py(来自 OpenAI Baselines,用于多进程并行环境)
# Adapted from OpenAI Baselines; runs several environments in parallel,
# one per subprocess.
# https://github.com/openai/baselines/tree/master/baselines/common/vec_env
import numpy as np
from multiprocessing import Process, Pipe


def worker(remote, parent_remote, env_fn_wrapper):
    """Subprocess loop: own one environment and serve commands from a pipe.

    Args:
        remote: This process's end of the pipe; ``(cmd, data)`` tuples are
            received from it and results are sent back on it.
        parent_remote: The parent's end of the pipe, closed immediately
            because this process does not use it.
        env_fn_wrapper: Object whose ``x`` attribute is a zero-argument
            callable that builds the environment.

    Raises:
        NotImplementedError: If an unknown command is received.
    """
    parent_remote.close()
    env = env_fn_wrapper.x()
    try:
        while True:
            cmd, data = remote.recv()
            if cmd == 'step':
                ob, reward, done, info = env.step(data)
                if done:
                    # Auto-reset so the caller always receives a usable observation.
                    ob = env.reset()
                remote.send((ob, reward, done, info))
            elif cmd == 'reset':
                ob = env.reset()
                remote.send(ob)
            elif cmd == 'reset_task':
                ob = env.reset_task()
                remote.send(ob)
            elif cmd == 'close':
                remote.close()
                break
            elif cmd == 'get_spaces':
                remote.send((env.observation_space, env.action_space))
            else:
                raise NotImplementedError
    finally:
        # Release the environment on 'close' and on any error; the original
        # loop exited without ever closing it.
        env.close()


class VecEnv(object):
    """
    An abstract asynchronous, vectorized environment.
    """

    def __init__(self, num_envs, observation_space, action_space):
        self.num_envs = num_envs
        self.observation_space = observation_space
        self.action_space = action_space

    def reset(self):
        """
        Reset all the environments and return an array of
        observations, or a tuple of observation arrays.
        If step_async is still doing work, that work will
        be cancelled and step_wait() should not be called
        until step_async() is invoked again.
        """
        pass

    def step_async(self, actions):
        """
        Tell all the environments to start taking a step
        with the given actions.
        Call step_wait() to get the results of the step.
        You should not call this if a step_async run is
        already pending.
        """
        pass

    def step_wait(self):
        """
        Wait for the step taken with step_async().
        Returns (obs, rews, dones, infos):
         - obs: an array of observations, or a tuple of
                arrays of observations.
         - rews: an array of rewards
         - dones: an array of "episode done" booleans
         - infos: a sequence of info objects
        """
        pass

    def close(self):
        """
        Clean up the environments' resources.
        """
        pass

    def step(self, actions):
        """Synchronous step: ``step_async(actions)`` followed by ``step_wait()``."""
        self.step_async(actions)
        return self.step_wait()


class CloudpickleWrapper(object):
    """
    Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
    """

    def __init__(self, x):
        self.x = x

    def __getstate__(self):
        import cloudpickle
        return cloudpickle.dumps(self.x)

    def __setstate__(self, ob):
        import pickle
        # NOTE(review): pickle.loads must only ever see trusted data; here the
        # payload is produced by __getstate__ in the parent process.
        self.x = pickle.loads(ob)


class SubprocVecEnv(VecEnv):
    """Vectorised environment that runs each environment in its own process."""

    def __init__(self, env_fns, spaces=None):
        """
        envs: list of gym environments to run in subprocesses
        """
        self.waiting = False
        self.closed = False
        nenvs = len(env_fns)
        self.nenvs = nenvs
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
        self.ps = [
            Process(target=worker,
                    args=(work_remote, remote, CloudpickleWrapper(env_fn)))
            for (work_remote, remote, env_fn)
            in zip(self.work_remotes, self.remotes, env_fns)
        ]
        for p in self.ps:
            p.daemon = True  # if the main process crashes, we should not cause things to hang
            p.start()
        # The worker ends belong to the children; close the parent's copies.
        for remote in self.work_remotes:
            remote.close()

        # Query the spaces from the first worker only.
        self.remotes[0].send(('get_spaces', None))
        observation_space, action_space = self.remotes[0].recv()
        VecEnv.__init__(self, len(env_fns), observation_space, action_space)

    def step_async(self, actions):
        """Send one action to each worker without waiting for the results."""
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
        self.waiting = True

    def step_wait(self):
        """Collect the pending step results from every worker.

        Returns ``(obs, rews, dones, infos)`` where the first three are
        stacked numpy arrays and ``infos`` is a tuple.
        """
        results = [remote.recv() for remote in self.remotes]
        self.waiting = False
        obs, rews, dones, infos = zip(*results)
        return np.stack(obs), np.stack(rews), np.stack(dones), infos

    def reset(self):
        """Reset every environment and return the stacked observations."""
        for remote in self.remotes:
            remote.send(('reset', None))
        return np.stack([remote.recv() for remote in self.remotes])

    def reset_task(self):
        """Call ``reset_task`` on every environment and stack the results."""
        for remote in self.remotes:
            remote.send(('reset_task', None))
        return np.stack([remote.recv() for remote in self.remotes])

    def close(self):
        """Stop all workers and wait for them to exit; safe to call twice."""
        if self.closed:
            return
        if self.waiting:
            # Drain outstanding step results so the workers can read 'close'.
            for remote in self.remotes:
                remote.recv()
        for remote in self.remotes:
            remote.send(('close', None))
        for p in self.ps:
            p.join()
        self.closed = True

    def __len__(self):
        return self.nenvs
3.2.2 utils.py(主要是文件创建与绘图函数)
import os
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
# import seaborn as sns
from matplotlib.font_manager import FontProperties  # font handling


def chinese_font():
    """Return a Chinese-capable font, or ``None`` to use the default font.

    The path below is the macOS system font; change it for other machines.
    The previous bare ``except`` never triggered for a missing file, so the
    existence check is done explicitly here.
    """
    font_path = '/System/Library/Fonts/STHeiti Light.ttc'
    if not os.path.isfile(font_path):
        return None
    return FontProperties(fname=font_path, size=15)


def plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag='train'):
    """Plot rewards and their moving average with Chinese labels.

    Reads ``env_name``, ``algo_name``, ``save`` and ``result_path`` from
    ``plot_cfg``; the figure is saved when ``plot_cfg.save`` is true.
    """
    # sns.set()
    font = chinese_font()
    plt.figure()
    plt.title(u"{}环境下{}算法的学习曲线".format(plot_cfg.env_name,
              plot_cfg.algo_name), fontproperties=font)
    plt.xlabel(u'回合数', fontproperties=font)
    plt.plot(rewards)
    plt.plot(ma_rewards)
    plt.legend((u'奖励', u'滑动平均奖励',), loc="best", prop=font)
    if plot_cfg.save:
        plt.savefig(os.path.join(plot_cfg.result_path, f"{tag}_rewards_curve_cn"))
    # plt.show()


def plot_rewards(rewards, ma_rewards, plot_cfg, tag='train'):
    """Plot rewards and their moving average, optionally save, then show.

    Reads ``device``, ``algo_name``, ``env_name``, ``save`` and
    ``result_path`` from ``plot_cfg``.
    """
    # sns.set()
    plt.figure()  # a fresh figure so several plots can coexist
    plt.title("learning curve on {} of {} for {}".format(
        plot_cfg.device, plot_cfg.algo_name, plot_cfg.env_name))
    plt.xlabel('episodes')  # fixed typo: was 'epsiodes'
    plt.plot(rewards, label='rewards')
    plt.plot(ma_rewards, label='ma rewards')
    plt.legend()
    if plot_cfg.save:
        plt.savefig(os.path.join(plot_cfg.result_path,
                                 "{}_rewards_curve".format(tag)))
    plt.show()


def plot_losses(losses, algo="DQN", save=True, path='./'):
    """Plot a loss curve for ``algo``, optionally save it under ``path``."""
    # sns.set()
    plt.figure()
    plt.title("loss curve of {}".format(algo))
    plt.xlabel('episodes')  # fixed typo: was 'epsiodes'
    plt.plot(losses, label='losses')  # the curve was mislabelled 'rewards'
    plt.legend()
    if save:
        plt.savefig(os.path.join(path, "losses_curve"))
    plt.show()


def save_results(rewards, ma_rewards, tag='train', path='./results'):
    """Save rewards and moving-average rewards as ``.npy`` files in ``path``.

    File names are ``{tag}_rewards.npy`` and ``{tag}_ma_rewards.npy``. The
    directory is created if missing, and ``path`` works with or without a
    trailing separator (plain concatenation turned the default into
    ``./resultstrain_rewards.npy``).
    """
    make_dir(path)
    np.save(os.path.join(path, '{}_rewards.npy'.format(tag)), rewards)
    np.save(os.path.join(path, '{}_ma_rewards.npy'.format(tag)), ma_rewards)
    print('结果保存完毕!')


def make_dir(*paths):
    """Create every directory in ``paths``, including missing parents."""
    for path in paths:
        Path(path).mkdir(parents=True, exist_ok=True)


def del_empty_dir(*paths):
    """Delete the empty sub-directories directly under each given path.

    Plain files are skipped. ``os.rmdir`` is used rather than
    ``os.removedirs`` so the given path and its ancestors are never removed,
    even when they end up empty.
    """
    for path in paths:
        for entry in os.listdir(path):
            candidate = os.path.join(path, entry)
            if os.path.isdir(candidate) and not os.listdir(candidate):
                os.rmdir(candidate)