Classic Reinforcement Learning Algorithms in Practice: REINFORCE
Algorithm Overview
The REINFORCE algorithm has a long history: it was published by Ronald J. Williams in Machine Learning in 1992 [1]. The name REINFORCE comes from the form of the update rule itself; it is an acronym of "REward Increment = Nonnegative Factor × Offset Reinforcement × Characteristic Eligibility", which corresponds to the formula:

$$\Delta \omega = \alpha (r - b) e$$

where $\Delta \omega$ is the parameter update, $\alpha$ is the learning rate (the nonnegative factor), $r$ is the reward, $b$ is a baseline value (so $r - b$ is the offset reinforcement), and $e = \cfrac{\partial\ln \pi_{\omega}}{\partial \omega}$ is the characteristic eligibility ($\pi_\omega$ is the policy).
REINFORCE is very concise and can be derived directly from the policy gradient theorem. If you are not familiar with it, see the Policy Gradient Algorithms article [2]; the notation used in this post is the same as in that article. The algorithm works as follows:
- Randomly initialize the policy parameters $\theta$.
- Use the policy $\pi_\theta$ to generate a sample trajectory: $S_1, A_1, R_2, S_2, A_2, \ldots, S_T$.
- For $t = 1, 2, \ldots, T$:
    - Estimate the return $G_t$.
    - Update the policy parameters: $\theta \leftarrow \theta + \alpha \gamma^t G_t \nabla_\theta \ln \pi_\theta(A_t \vert S_t)$
REINFORCE is an on-policy algorithm. In practice we usually subtract a baseline value from $G_t$ to reduce the variance during training and stabilize the training process. A common choice of baseline is the state-value function $V(s)$; in practice, simply using the mean of the $G_t$ values also works quite well.
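To make the update rule concrete, below is a minimal sketch of a single REINFORCE update in PyTorch. It assumes a hypothetical policy_net that maps a batch of states to action probabilities, together with per-step returns from one rollout; the mean return is subtracted as the baseline. The names here are illustrative and are not part of the implementation shown later.

import torch
from torch.distributions import Categorical

def reinforce_update(policy_net, optimizer, states, actions, returns, gamma=1.0):
    # Subtract the mean return as a simple baseline (variance reduction).
    advantages = returns - returns.mean()
    # Per-step discount weights gamma^t, matching the update rule above.
    discounts = gamma ** torch.arange(len(returns), dtype=torch.float32)
    probs = policy_net(states)                      # (T, n_actions) action probabilities
    log_probs = Categorical(probs).log_prob(actions)
    # Gradient ascent on sum_t gamma^t G_t log pi(a_t|s_t) == descent on its negative.
    loss = -(discounts * advantages * log_probs).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()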
CartPole-v0 Experiment
Here we use the simple pole-balancing task to test the REINFORCE algorithm.
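As a quick sanity check (this snippet is not part of the original experiment code), we can inspect the CartPole-v0 interface: the observation is a 4-dimensional vector (cart position, cart velocity, pole angle, pole angular velocity) and there are 2 discrete actions (push left or push right). This is why the policy network used later has an input size of 4 and an output size of 2.

import gym

env = gym.make('CartPole-v0')
print(env.observation_space)  # 4-dimensional Box: cart position/velocity, pole angle/angular velocity
print(env.action_space)       # Discrete(2): push the cart left or right

state = env.reset()           # the old gym API returns just the initial observation
next_state, reward, done, info = env.step(env.action_space.sample())
print(next_state.shape, reward, done)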
Algorithm Implementation
The implementation of the REINFORCE algorithm is shown below.
# -*- coding: utf-8 -*-
import numpy as np
import torch
import torch.optim as optim
from torch.distributions import Categorical


class EpisodeData(object):
    """Container for the transitions collected during one data-generation round."""

    def __init__(self):
        self.fields = ['states', 'actions', 'log_probs', 'rewards', 'dones']
        for f in self.fields:
            setattr(self, f, [])
        self.total_rewards = 0

    def add_record(self, state, action, reward, done, log_prob=None):
        self.states.append(state)
        self.actions.append(action)
        self.log_probs.append(log_prob)
        self.dones.append(done)
        self.rewards.append(reward)
        self.total_rewards += reward

    def get_states(self):
        return np.array(self.states)

    def get_actions(self):
        return np.array(self.actions)

    def discount_returns(self, gamma=1.0):
        # Compute the discounted return G_t for every step, walking backwards
        # and resetting the accumulator at episode boundaries (done == True).
        r = 0
        returns = []
        for reward, done in zip(self.rewards[::-1], self.dones[::-1]):
            if done:
                r = 0
            r = r * gamma + reward
            returns.insert(0, r)
        return np.array(returns)

    def steps(self):
        return len(self.states)


class REINFORCE(object):

    def __init__(self,
                 env,
                 model,
                 lr=1e-4,
                 gamma=1.0,
                 optimizer='sgd',
                 device='cpu',
                 exploring=None,
                 n_trained_times=1,
                 baseline='mean',
                 tbwriter=None,
                 deterministic=True) -> None:
        self.env = env
        self.model = model
        self.lr = lr
        self.gamma = gamma
        if optimizer == 'adam':
            self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        elif optimizer == 'sgd':
            self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)
        self.device = device
        self.exploring = exploring
        self.n_trained_times = n_trained_times
        self.tbwriter = tbwriter
        self.baseline = baseline
        self.deterministic = deterministic

    def get_exploring(self):
        # Probability of taking a random action instead of sampling from the policy.
        if isinstance(self.exploring, float):
            return self.exploring
        elif self.exploring == 'quadratic_decrease':
            # Decay with the number of training rounds, but never below 0.1.
            return max(0.1, self.n_trained_times**(-0.5))
        return 0.01

    def gen_epoch_data(self, n_steps=1024):
        # Roll out the current policy for a fixed number of steps, restarting
        # the environment whenever an episode terminates.
        state = self.env.reset()
        done = False
        episode_data = EpisodeData()
        self.model.eval()
        exploring = self.get_exploring()
        for _ in range(n_steps):
            action_prob = self.model(
                torch.tensor(state[np.newaxis, :]).float())
            policy = Categorical(action_prob)
            if not self.deterministic and np.random.rand() <= exploring:
                action = self.env.action_space.sample()
            else:
                action = policy.sample().detach().item()
            next_state, reward, done, _ = self.env.step(int(action))
            if done:
                # Penalize termination so the agent learns to keep the pole up longer.
                reward -= 10
            episode_data.add_record(state, action, reward, done)
            state = next_state
            if done:
                state = self.env.reset()
        return episode_data

    def train(self, actions, states, returns, discounts):
        self.n_trained_times += 1
        actions = torch.tensor(actions)
        states = torch.tensor(states).float()
        returns = torch.tensor(returns).float()
        discounts = torch.tensor(discounts).float()
        self.model.train()
        probs = self.model(states)
        policy = Categorical(probs)
        log_probs = policy.log_prob(actions)
        # Policy gradient loss: maximize sum_t gamma^t * G_t * log pi(a_t|s_t),
        # i.e. minimize its negative mean.
        loss = -torch.mean(log_probs * returns * discounts)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def learning(self, n_epoches=100):
        decay_reward = 0
        decay = 0.95
        for n in range(n_epoches):
            episode_data = self.gen_epoch_data()
            n_steps = episode_data.steps()
            returns = episode_data.discount_returns(gamma=self.gamma)
            # Per-step discount weights used in the update rule.
            discounts = np.full(n_steps, self.gamma)
            if self.gamma != 1.0:
                for i in range(1, n_steps):
                    discounts[i] = self.gamma**(1 + i)
            if self.baseline == 'mean':
                # Subtract the mean return as a simple baseline to reduce variance.
                returns -= returns.mean()
            loss = self.train(actions=episode_data.get_actions(),
                              states=episode_data.get_states(),
                              returns=returns,
                              discounts=discounts)
            # Exponential moving average of the total reward per round, for logging.
            if decay_reward == 0:
                decay_reward = episode_data.total_rewards
            else:
                decay_reward = decay_reward * decay + episode_data.total_rewards * (
                    1 - decay)
            if self.tbwriter:
                self.tbwriter.add_scalar('training loss', loss, n + 1)
                self.tbwriter.add_scalar('decay reward', decay_reward, n + 1)
            if n % 10 == 0:
                print(f'round: {n:>3d} | loss: {loss:>5.3f} | '
                      f'pre reward: {decay_reward:>5.2f}')
The REINFORCE algorithm is simple and easy to implement. The implementation above provides two main pieces of functionality:

- Data generation: the implementation here differs slightly from the paper. REINFORCE is normally run on complete episodes (from the initial state until the environment reaches a terminal state), whereas here we simply generate a fixed number of steps per round. If you want to see how the algorithm behaves on complete episodes, you can modify the code yourself; a sketch of one possible change follows this list. To keep the code easy to follow, the data generated in each round is stored in the EpisodeData class.
- Training: the training step is implemented directly with PyTorch, so we do not have to compute any gradients by hand, which is very convenient. In this implementation the mean of the $G_t$ values can be used as the baseline. We also expose $\gamma$ as the discount factor for the returns; the default is 1.0, i.e. no discounting, which already works well for the CartPole task. For more complex tasks, a discount factor smaller than 1 may make convergence easier.
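For reference, here is one way the data-generation step could be changed to collect complete episodes instead of a fixed number of steps. This is only a sketch and not part of the original implementation: gen_full_episode is a hypothetical helper, and it reuses the EpisodeData container and the sampling logic from gen_epoch_data above.

import numpy as np
import torch
from torch.distributions import Categorical

def gen_full_episode(env, model, max_steps=10000):
    # Collect one complete trajectory: run the current policy from reset()
    # until the episode terminates (or max_steps is reached).
    state = env.reset()
    episode_data = EpisodeData()  # the container class defined earlier
    model.eval()
    for _ in range(max_steps):
        with torch.no_grad():
            action_prob = model(torch.tensor(state[np.newaxis, :]).float())
        action = Categorical(action_prob).sample().item()
        next_state, reward, done, _ = env.step(int(action))
        episode_data.add_record(state, action, reward, done)
        state = next_state
        if done:
            break
    return episode_data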
Training Code
For the pole-balancing experiment we directly use the CartPole-v0 environment from OpenAI's gym package.
# -*- coding: utf-8 -*-
import os

import gym
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter

# The REINFORCE and EpisodeData classes from the previous listing are assumed
# to be defined in (or imported into) this training script.


class CP0Model(nn.Module):
    """A small MLP policy: 4-dimensional observation -> probabilities over 2 actions."""

    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(4, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 2),
        )
        self._initialize_weights()

    def forward(self, x):
        # Softmax over the action dimension gives a valid probability distribution.
        return torch.softmax(self.fc(x), dim=1)

    def _initialize_weights(self):
        for module in self.modules():
            if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.normal_(module.bias)


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    model = CP0Model()
    writer = SummaryWriter('tb/reinforce-cartpole-v0')
    reinforce = REINFORCE(env=env,
                          model=model,
                          lr=1e-3,
                          optimizer='adam',
                          gamma=0.95,
                          tbwriter=writer,
                          baseline='mean',
                          deterministic=False)
    reinforce.learning(5000)
    model_base = 'models/'
    if not os.path.exists(model_base):
        os.makedirs(model_base)
    torch.save(model.state_dict(),
               'models/cartpole-v0-brand-new-baseline-random.pt')
By default each training round uses 1024 steps of data; this is controlled by the n_steps parameter of gen_epoch_data (to change it, adjust the call inside learning or the parameter's default value). Below is the output from the first 1000 training rounds.
round: 0 | loss: -0.025 | pre reward: 404.00
round: 10 | loss: -0.006 | pre reward: 429.65
round: 20 | loss: 0.051 | pre reward: 468.72
round: 30 | loss: -0.003 | pre reward: 500.54
round: 40 | loss: 0.025 | pre reward: 523.58
round: 50 | loss: -0.023 | pre reward: 549.84
round: 60 | loss: -0.006 | pre reward: 562.69
round: 70 | loss: -0.031 | pre reward: 569.05
round: 80 | loss: -0.064 | pre reward: 564.48
round: 90 | loss: -0.010 | pre reward: 565.44
round: 100 | loss: 0.004 | pre reward: 569.87
round: 110 | loss: 0.039 | pre reward: 570.39
round: 120 | loss: -0.004 | pre reward: 571.03
round: 130 | loss: 0.085 | pre reward: 577.27
round: 140 | loss: -0.012 | pre reward: 583.53
round: 150 | loss: -0.017 | pre reward: 581.73
round: 160 | loss: 0.012 | pre reward: 588.51
round: 170 | loss: 0.027 | pre reward: 593.25
round: 180 | loss: -0.019 | pre reward: 590.32
round: 190 | loss: -0.062 | pre reward: 595.40
round: 200 | loss: 0.012 | pre reward: 597.81
round: 210 | loss: 0.024 | pre reward: 600.44
round: 220 | loss: -0.030 | pre reward: 612.86
round: 230 | loss: 0.098 | pre reward: 617.62
round: 240 | loss: -0.014 | pre reward: 623.26
round: 250 | loss: -0.017 | pre reward: 638.83
round: 260 | loss: 0.043 | pre reward: 659.33
round: 270 | loss: 0.008 | pre reward: 661.90
round: 280 | loss: 0.169 | pre reward: 660.35
round: 290 | loss: -0.036 | pre reward: 676.20
round: 300 | loss: 0.144 | pre reward: 693.86
round: 310 | loss: -0.060 | pre reward: 713.35
round: 320 | loss: 0.102 | pre reward: 721.79
round: 330 | loss: -0.001 | pre reward: 737.35
round: 340 | loss: 0.108 | pre reward: 748.43
round: 350 | loss: 0.007 | pre reward: 750.21
round: 360 | loss: 0.072 | pre reward: 767.98
round: 370 | loss: -0.021 | pre reward: 778.25
round: 380 | loss: 0.101 | pre reward: 791.54
round: 390 | loss: -0.036 | pre reward: 799.65
round: 400 | loss: 0.061 | pre reward: 812.93
round: 410 | loss: 0.008 | pre reward: 819.22
round: 420 | loss: 0.051 | pre reward: 825.24
round: 430 | loss: 0.082 | pre reward: 827.40
round: 440 | loss: 0.102 | pre reward: 830.91
round: 450 | loss: 0.086 | pre reward: 844.24
round: 460 | loss: 0.017 | pre reward: 843.69
round: 470 | loss: 0.071 | pre reward: 846.08
round: 480 | loss: 0.064 | pre reward: 853.37
round: 490 | loss: -0.011 | pre reward: 856.95
round: 500 | loss: -0.009 | pre reward: 859.55
round: 510 | loss: 0.021 | pre reward: 865.09
round: 520 | loss: 0.053 | pre reward: 868.20
round: 530 | loss: 0.024 | pre reward: 864.45
round: 540 | loss: 0.062 | pre reward: 867.47
round: 550 | loss: -0.024 | pre reward: 869.43
round: 560 | loss: 0.072 | pre reward: 865.47
round: 570 | loss: 0.013 | pre reward: 874.20
round: 580 | loss: 0.058 | pre reward: 881.78
round: 590 | loss: 0.075 | pre reward: 885.03
round: 600 | loss: 0.046 | pre reward: 888.89
round: 610 | loss: 0.068 | pre reward: 893.18
round: 620 | loss: 0.030 | pre reward: 891.80
round: 630 | loss: -0.006 | pre reward: 892.69
round: 640 | loss: 0.076 | pre reward: 894.49
round: 650 | loss: 0.062 | pre reward: 897.99
round: 660 | loss: 0.061 | pre reward: 899.22
round: 670 | loss: 0.063 | pre reward: 903.39
round: 680 | loss: 0.044 | pre reward: 901.44
round: 690 | loss: 0.047 | pre reward: 899.06
round: 700 | loss: 0.015 | pre reward: 895.08
round: 710 | loss: 0.040 | pre reward: 893.63
round: 720 | loss: 0.054 | pre reward: 898.64
round: 730 | loss: -0.006 | pre reward: 903.43
round: 740 | loss: 0.055 | pre reward: 906.43
round: 750 | loss: 0.058 | pre reward: 907.97
round: 760 | loss: 0.005 | pre reward: 906.76
round: 770 | loss: 0.050 | pre reward: 910.49
round: 780 | loss: 0.055 | pre reward: 913.97
round: 790 | loss: 0.066 | pre reward: 914.40
round: 800 | loss: 0.039 | pre reward: 915.44
round: 810 | loss: 0.063 | pre reward: 916.50
round: 820 | loss: 0.056 | pre reward: 919.64
round: 830 | loss: 0.043 | pre reward: 923.71
round: 840 | loss: 0.042 | pre reward: 927.13
round: 850 | loss: 0.058 | pre reward: 926.24
round: 860 | loss: 0.049 | pre reward: 931.58
round: 870 | loss: 0.043 | pre reward: 933.46
round: 880 | loss: 0.035 | pre reward: 935.12
round: 890 | loss: 0.039 | pre reward: 935.45
round: 900 | loss: 0.038 | pre reward: 938.63
round: 910 | loss: 0.042 | pre reward: 938.23
round: 920 | loss: 0.034 | pre reward: 936.26
round: 930 | loss: 0.048 | pre reward: 928.37
round: 940 | loss: 0.049 | pre reward: 923.11
round: 950 | loss: -0.067 | pre reward: 918.82
round: 960 | loss: -0.099 | pre reward: 924.75
round: 970 | loss: 0.047 | pre reward: 936.96
round: 980 | loss: 0.040 | pre reward: 942.90
round: 990 | loss: 0.043 | pre reward: 944.04
We can see that the reward grows steadily here (the reported reward is computed with an exponential moving average).
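For clarity, the smoothed value printed as "pre reward" follows the moving-average update used in learning (with a decay of 0.95):

$$\bar{R}_n = 0.95\,\bar{R}_{n-1} + 0.05\,R_n$$

where $R_n$ is the total reward collected in round $n$, and $\bar{R}_1$ is initialized to the first round's total reward.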
Checking the Results
Now we can load the saved model and run it directly in the CartPole-v0 environment to see how it performs. Run the following code:
import numpy as np
import gym
import torch

# Import the model class we just trained from the training script.
from train import CP0Model


def get_action(model, state):
    # Greedy action: pick the action with the highest probability.
    with torch.no_grad():
        probs = model(torch.tensor(state[np.newaxis, :]).float())
    return probs.argmax().item()


if __name__ == '__main__':
    model = CP0Model()
    model.load_state_dict(
        torch.load('./models/cartpole-v0-brand-new-baseline-random.pt'))
    model.eval()
    env = gym.make('CartPole-v0')
    done = False
    n_restart = 0
    try:
        state = env.reset()
        total_reward = 0.
        for _ in range(2000):
            env.render()
            if done:
                print(f'done, total reward: {total_reward}')
                state = env.reset()
                n_restart += 1
                total_reward = 0
            action = get_action(model, state)
            state, reward, done, _ = env.step(action)  # take the greedy action
            total_reward += reward
        print(f'restart: {n_restart}')
    except Exception as e:
        print(e)
    env.close()
After only 1000 training rounds the performance is fairly ordinary; training for 2000 rounds or more should give good results.
Below is the fully trained model in action (it performs well, and the pole only swings by a small amount):
References
[1] Williams, Ronald J. "Simple statistical gradient-following algorithms for connectionist reinforcement learning." Machine learning 8.3 (1992): 229-256.
[2] 策略梯度算法专题 (Policy Gradient Algorithms): https://paperexplained.cn/articles/article/sdetail/36ae615c-6811-442e-99f6-a96465dc04a7/