Click to jump to the original book's code.
1 First Look at RL
Fixing pip install failures on my machine by switching to the Douban PyPI mirror:
pip install (PACKAGE) -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
Installing the Gym package (into the base environment):
pip install gym -i https://pypi.douban.com/simple
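A quick sanity check after installation (my own snippet, not from the book) to confirm that Gym imports and can create the environment used below:

import gym
print(gym.__version__)            # confirm the package is importable and show its version
env = gym.make('MountainCar-v0')  # the environment used in the example below
print(env.observation_space, env.action_space)
env.close()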
Example: MountainCar (full code)
import gym
import numpy as np

env = gym.make('MountainCar-v0')
print('Observation space = {}'.format(env.observation_space))
print('Action space = {}'.format(env.action_space))
print('Observation range = {} ~ {}'.format(env.observation_space.low, env.observation_space.high))
print('Number of actions = {}'.format(env.action_space.n))

class BespokeAgent:
    def __init__(self, env):
        pass

    def decide(self, observation):  # decide on an action
        position, velocity = observation
        lb = min(-0.09 * (position + 0.25) ** 2 + 0.03,
                 0.3 * (position + 0.9) ** 4 - 0.008)
        ub = -0.07 * (position + 0.38) ** 2 + 0.06
        if lb < velocity < ub:
            action = 2
        else:
            action = 0
        return action  # return the chosen action

    def learn(self, *args):  # learn (this hand-crafted agent does not learn)
        pass

agent = BespokeAgent(env)

# Interact with the environment for one episode
def play_montecarlo(env, agent, render=False, train=False):
    episode_reward = 0.  # total episode reward, initialized to 0
    observation = env.reset()  # reset the environment and start a new episode
    while True:  # loop until the episode ends
        if render:  # whether to show the display
            env.render()  # show the GUI; it can be closed with env.close()
        action = agent.decide(observation)
        next_observation, reward, done, _ = env.step(action)  # execute the action
        episode_reward += reward  # accumulate the episode reward
        if train:  # whether to train the agent
            agent.learn(observation, action, reward, done)  # learn
        if done:  # episode finished, break out of the loop
            break
        observation = next_observation
    return episode_reward  # return the total episode reward

env.seed(0)  # set the random seed only for exact reproducibility; it can usually be omitted
episode_reward = play_montecarlo(env, agent, render=True)
print('Episode reward = {}'.format(episode_reward))
env.close()  # close the GUI
episode_rewards = [play_montecarlo(env, agent) for _ in range(100)]
print('Average episode reward = {}'.format(np.mean(episode_rewards)))
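Restating the hand-crafted decide rule above in formula form (not an extra result from the book): with position p and velocity v, and recalling that in MountainCar-v0 action 0 accelerates left and action 2 accelerates right, the agent pushes right only while the velocity lies between two hand-tuned curves of the position:

\ell(p) = \min\bigl(-0.09\,(p + 0.25)^2 + 0.03,\; 0.3\,(p + 0.9)^4 - 0.008\bigr)
u(p) = -0.07\,(p + 0.38)^2 + 0.06
a = \begin{cases} 2\ \text{(push right)} & \ell(p) < v < u(p) \\ 0\ \text{(push left)} & \text{otherwise} \end{cases}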
2 Markov Decision Processes
2-1 Solving the Bellman equations: first, a small trick
# In format(), the variable's value is substituted directly for the braces
bb = {'hsaj': (23, 34)}
for i in bb.keys():
    print(bb['{}'.format(i)])
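Since '{}'.format(i) simply produces the string value of i, the lookup above is equivalent to bb[i]; the same pattern is handy for building dictionary keys dynamically. A minimal illustration with my own toy values (not from the book):

prices = {'apple_2023': 3.5, 'apple_2024': 4.0}  # hypothetical dictionary
name, year = 'apple', 2024
key = '{}_{}'.format(name, year)  # builds the key 'apple_2024'
print(prices[key])                # 4.0, same idea as bb['{}'.format(i)] above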
Solving the Bellman expectation equations
import sympy
from sympy import symbols
sympy.init_printing()

# Solve the Bellman expectation equations
v_hungry, v_full = symbols('v_hungry v_full')
q_hungry_eat, q_hungry_none, q_full_eat, q_full_none = symbols(
        'q_hungry_eat q_hungry_none q_full_eat q_full_none')
alpha, beta, gamma = symbols('alpha beta gamma')
x, y = symbols('x y')
system = sympy.Matrix((
        (1, 0, x - 1, -x, 0, 0, 0),
        (0, 1, 0, 0, -y, y - 1, 0),
        (-gamma, 0, 1, 0, 0, 0, -2),
        ((alpha - 1) * gamma, -alpha * gamma, 0, 1, 0, 0, 4 * alpha - 3),
        (-beta * gamma, (beta - 1) * gamma, 0, 0, 1, 0, -4 * beta + 2),
        (0, -gamma, 0, 0, 0, 1, 1)))
solve_dict = sympy.solve_linear_system(system, v_hungry, v_full,
        q_hungry_none, q_hungry_eat, q_full_none, q_full_eat)
for i in solve_dict.items():
    print(i)
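Each row of the augmented 6×7 matrix above encodes one Bellman expectation equation for the hungry/full example; for instance, the first row reads v_hungry = (1 - x)·q_hungry_none + x·q_hungry_eat, where x plays the role of the probability of choosing "eat" in the hungry state. In general form (standard definitions, not specific to this example):

v_\pi(s) = \sum_a \pi(a \mid s)\, q_\pi(s, a)
q_\pi(s, a) = r(s, a) + \gamma \sum_{s'} p(s' \mid s, a)\, v_\pi(s')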
Solving the Bellman optimality equations
import sympy
from sympy import symbols
sympy.init_printing()

# Solve the Bellman optimality equations
v_hungry, v_full = symbols('v_hungry v_full')
q_hungry_eat, q_hungry_none, q_full_eat, q_full_none = symbols(
        'q_hungry_eat q_hungry_none q_full_eat q_full_none')
alpha, beta, gamma = symbols('alpha beta gamma')
x, y = symbols('x y')
xy_tuples = ((0, 0), (1, 0), (0, 1), (1, 1))  # enumerate the four deterministic policies
for x, y in xy_tuples:
    system = sympy.Matrix((
            (1, 0, x - 1, -x, 0, 0, 0),
            (0, 1, 0, 0, -y, y - 1, 0),
            (-gamma, 0, 1, 0, 0, 0, -2),
            ((alpha - 1) * gamma, -alpha * gamma, 0, 1, 0, 0, 4 * alpha - 3),
            (-beta * gamma, (beta - 1) * gamma, 0, 0, 1, 0, -4 * beta + 2),
            (0, -gamma, 0, 0, 0, 1, 1)))
    result = sympy.solve_linear_system(system, v_hungry, v_full,
            q_hungry_none, q_hungry_eat, q_full_none, q_full_eat,
            simplification=True)
    msgx = 'v(hungry) = q(hungry, {})'.format('eat' if x else 'no eat')
    msgy = 'v(full) = q(full, {})'.format('no eat' if y else 'eat')
    print('==== {}, {} ==== x = {}, y = {} ===='.format(msgx, msgy, x, y))  # nested use of {} placeholders
    print(result)
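For comparison, the Bellman optimality equations replace the policy average with a max (standard form, not quoted from the book):

v_*(s) = \max_a q_*(s, a)
q_*(s, a) = r(s, a) + \gamma \sum_{s'} p(s' \mid s, a)\, v_*(s')

Because of the max operator the system is no longer linear, which is why the loop above enumerates the four deterministic policies (x, y) ∈ {0, 1}²: each choice of (x, y) yields a linear system, and the candidate whose solution actually satisfies the max conditions is the optimal one.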
2-2 Cliff Walking
import numpy as np
np.random.seed(0)
import scipy
import gym

# Create the environment
env = gym.make('CliffWalking-v0')
env.seed(0)
print('Observation space = {}'.format(env.observation_space))
print('Action space = {}'.format(env.action_space))
print('Number of states = {}, number of actions = {}'.format(env.nS, env.nA))
print('Map size = {}'.format(env.shape))

# Run one episode
def play_once(env, policy):
    total_reward = 0
    state = env.reset()  # the initial state is 36
    while True:
        loc = np.unravel_index(state, env.shape)  # convert the flat state index into (row, column)
        print('State = {}, location = {}'.format(state, loc), end=' ')
        action = np.random.choice(env.nA, p=policy[state])
        next_state, reward, done, _ = env.step(action)
        print('Action = {}, reward = {}'.format(action, reward))
        total_reward += reward
        if done:
            break
        state = next_state
    return total_reward

# Run one episode with the optimal policy
actions = np.ones(env.shape, dtype=int)
actions[-1, :] = 0  # actions 0, 1, 2, 3 mean up, right, down, left respectively
actions[:, -1] = 2
optimal_policy = np.eye(4)[actions.reshape(-1)]  # fancy indexing with the action indices yields one-hot rows
total_reward = play_once(env, optimal_policy)
print('Episode reward = {}'.format(total_reward))
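The np.eye(4)[actions.reshape(-1)] line uses NumPy fancy indexing to turn each action index into a one-hot probability row. A standalone illustration with made-up indices (not from the book):

import numpy as np

acts = np.array([2, 0, 3])  # three arbitrary action indices
one_hot = np.eye(4)[acts]   # row i of the 4x4 identity matrix is picked for each index
print(one_hot)
# [[0. 0. 1. 0.]
#  [1. 0. 0. 0.]
#  [0. 0. 0. 1.]]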
Solving the Bellman expectation equations
# Solve the Bellman expectation equations
def evaluate_bellman(env, policy, gamma=1.):
    a, b = np.eye(env.nS), np.zeros((env.nS))  # identity matrix a, zero vector b
    for state in range(env.nS - 1):  # skip the final terminal state
        for action in range(env.nA):
            pi = policy[state][action]  # probability of this action under the policy
            for p, next_state, reward, done in env.P[state][action]:  # env.P stores the dynamics as a dict; here every transition probability is 0 or 1
                a[state, next_state] -= (pi * gamma * p)
                b[state] += (pi * reward * p)
    v = np.linalg.solve(a, b)  # solve the linear system a x = b for the state values
    q = np.zeros((env.nS, env.nA))  # state-action values of the terminal state default to 0
    for state in range(env.nS - 1):
        for action in range(env.nA):
            for p, next_state, reward, done in env.P[state][action]:
                q[state][action] += ((reward + gamma * v[next_state]) * p)
    return v, q

# Evaluate the value of a random policy
policy = np.random.uniform(size=(env.nS, env.nA))  # shape (48, 4), uniform on (0, 1); the terminal state is included but that does not matter
policy = policy / np.sum(policy, axis=1)[:, np.newaxis]  # normalize each row
state_values, action_values = evaluate_bellman(env, policy)
print('State values = {}'.format(state_values))  # shape (48,)
print('Action values = {}'.format(action_values))  # shape (48, 4)

# Evaluate the value of the hand-crafted optimal policy
optimal_state_values, optimal_action_values = evaluate_bellman(env, optimal_policy)
print('Optimal state values = {}'.format(optimal_state_values))
print('Optimal action values = {}'.format(optimal_action_values))
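For reference, evaluate_bellman builds and solves the matrix form of the Bellman expectation equation (standard derivation, not quoted from the book):

v_\pi = r_\pi + \gamma P_\pi v_\pi \;\Longrightarrow\; (I - \gamma P_\pi)\, v_\pi = r_\pi
(P_\pi)_{s,s'} = \sum_a \pi(a \mid s)\, p(s' \mid s, a), \qquad (r_\pi)_s = \sum_a \pi(a \mid s) \sum_{s'} p(s' \mid s, a)\, r(s, a, s')
q_\pi(s, a) = \sum_{s'} p(s' \mid s, a)\,\bigl[r(s, a, s') + \gamma\, v_\pi(s')\bigr]

In the code, a plays the role of I - γP_π, b the role of r_π, np.linalg.solve returns v_π, and q_π is then obtained by a one-step look-ahead from v_π.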