This example comes from the official ML-Agents samples. GitHub: https://github.com/Unity-Technologies/ml-agents
This post builds on two of my earlier articles and assumes some familiarity with ML-Agents; for details see: Unity强化学习之ML-Agents的使用 and ML-Agents命令及配置大全.
The main topics of this post are Curriculum Learning and the use of the ModelOverrider script.
Reference: ML-Agents(九)Wall Jump
Environment Overview
The environment in this example is fairly simple: the agent's goal is to reach the green goal area, but a wall of random height stands in the way. When the wall is low, the agent can simply jump over it and reach the goal quickly; when the wall is high, it first has to push a block up against the wall to use as a step, and then jump over.
As in the push-block example, the agent's observations use a ray perception sensor with fourteen rays; how the rays work is not repeated here.
For details, see my earlier post ML-Agents案例之推箱子游戏的单人模式.
Besides the ray sensor, the agent script adds the agent's position relative to the platform and a flag for whether it is touching the ground, four vector observations in total.
The agent's output consists of four discrete action branches: the first chooses forward / backward / do nothing, the second rotates left / right / does nothing, the third strafes left / right / does nothing, and the fourth chooses jump / do nothing.
In the agent's setup, the ray sensor detects three tags: wall, block, and goal.
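To make this input/output layout concrete, here is a small sketch of how it maps onto ML-Agents types. This is illustration only: in the actual project these values are configured on the Behavior Parameters component in the inspector rather than in code, and the class name below is made up (BehaviorParameters, ActionSpec and RayPerceptionSensorComponent3D are standard ML-Agents types).

using Unity.MLAgents.Actuators;
using Unity.MLAgents.Policies;
using UnityEngine;

// Illustration only: mirrors the inspector settings described above.
public class WallJumpSpecSketch : MonoBehaviour
{
    void Awake()
    {
        var behavior = GetComponent<BehaviorParameters>();
        // 4 vector observations: position relative to the platform (3) + grounded flag (1);
        // the 14 rays are added separately by a RayPerceptionSensorComponent3D.
        behavior.BrainParameters.VectorObservationSize = 4;
        // 4 discrete branches: forward/back, rotate, strafe, jump - each with a no-op choice.
        behavior.BrainParameters.ActionSpec = ActionSpec.MakeDiscrete(3, 3, 3, 2);
    }
}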
Script Code
This project is fairly simple; there is only one main agent script to look at:
WallJumpAgent.cs
using System.Collections;
using UnityEngine;
using Unity.MLAgents;
using Unity.Barracuda;
using Unity.MLAgents.Actuators;
using Unity.MLAgents.Sensors;
using Unity.MLAgentsExamples;
public class WallJumpAgent : Agent
{
// Depending on this value, the wall will have different height
int m_Configuration;
// Three models, one for each of the three wall configurations
// Brain to use when no wall is present
public NNModel noWallBrain;
// Brain to use when a jumpable wall is present
public NNModel smallWallBrain;
// Brain to use when a wall requiring a block to jump over is present
public NNModel bigWallBrain;
public GameObject ground;
public GameObject spawnArea;
Bounds m_SpawnAreaBounds;
public GameObject goal;
public GameObject shortBlock;
public GameObject wall;
Rigidbody m_ShortBlockRb;
Rigidbody m_AgentRb;
Material m_GroundMaterial;
Renderer m_GroundRenderer;
WallJumpSettings m_WallJumpSettings;
public float jumpingTime;
public float jumpTime;
// This is a downward force applied when falling to make jumps look
// less floaty
public float fallingForce;
// Used to check the colliding objects
public Collider[] hitGroundColliders = new Collider[3];
Vector3 m_JumpTargetPos;
Vector3 m_JumpStartingPos;
// Behavior names defined in the config file; only two distinct names are actually used (the no-wall case reuses SmallWallJump)
string m_NoWallBehaviorName = "SmallWallJump";
string m_SmallWallBehaviorName = "SmallWallJump";
string m_BigWallBehaviorName = "BigWallJump";
EnvironmentParameters m_ResetParams;
// Initialization
public override void Initialize()
{
// Find the settings script in the scene
m_WallJumpSettings = FindObjectOfType<WallJumpSettings>();
// Randomly pick a wall-height configuration
m_Configuration = Random.Range(0, 5);
// Cache the components we need
m_AgentRb = GetComponent<Rigidbody>();
m_ShortBlockRb = shortBlock.GetComponent<Rigidbody>();
m_SpawnAreaBounds = spawnArea.GetComponent<Collider>().bounds;
m_GroundRenderer = ground.GetComponent<Renderer>();
m_GroundMaterial = m_GroundRenderer.material;
// Hide the spawn area
spawnArea.SetActive(false);
// Get the environment parameters (set by the trainer / curriculum)
m_ResetParams = Academy.Instance.EnvironmentParameters;
// Update model references if we're overriding
var modelOverrider = GetComponent<ModelOverrider>();
// If models are being overridden, fetch the three models and behavior names from the overrider
if (modelOverrider.HasOverrides)
{
noWallBrain = modelOverrider.GetModelForBehaviorName(m_NoWallBehaviorName);
m_NoWallBehaviorName = ModelOverrider.GetOverrideBehaviorName(m_NoWallBehaviorName);
smallWallBrain = modelOverrider.GetModelForBehaviorName(m_SmallWallBehaviorName);
m_SmallWallBehaviorName = ModelOverrider.GetOverrideBehaviorName(m_SmallWallBehaviorName);
bigWallBrain = modelOverrider.GetModelForBehaviorName(m_BigWallBehaviorName);
m_BigWallBehaviorName = ModelOverrider.GetOverrideBehaviorName(m_BigWallBehaviorName);
}
}
// Start a jump
public void Jump()
{
jumpingTime = 0.2f;
m_JumpStartingPos = m_AgentRb.position;
}
// Ground check; two different methods depending on smallCheck
public bool DoGroundCheck(bool smallCheck)
{
if (!smallCheck)
{
hitGroundColliders = new Collider[3];
var o = gameObject;
// Overlap a box around the agent's feet and collect the colliders it touches.
// Arguments: box center, half-extents, results array, box rotation.
Physics.OverlapBoxNonAlloc(
o.transform.position + new Vector3(0, -0.05f, 0),
new Vector3(0.95f / 2f, 0.5f, 0.95f / 2f),
hitGroundColliders,
o.transform.rotation);
var grounded = false;
// If any collider with one of these tags is found, the agent is grounded
foreach (var col in hitGroundColliders)
{
if (col != null && col.transform != transform &&
(col.CompareTag("walkableSurface") ||
col.CompareTag("block") ||
col.CompareTag("wall")))
{
grounded = true; //then we're grounded
break;
}
}
return grounded;
}
else
{
// Ground check via raycast
RaycastHit hit;
// Cast a ray 1 unit straight down from just below the agent
Physics.Raycast(transform.position + new Vector3(0, -0.05f, 0), -Vector3.up, out hit,
1f);
// Grounded if the ray hits an object with one of these tags on a (nearly) upward-facing surface
if (hit.collider != null &&
(hit.collider.CompareTag("walkableSurface") ||
hit.collider.CompareTag("block") ||
hit.collider.CompareTag("wall"))
&& hit.normal.y > 0.95f)
{
return true;
}
return false;
}
}
// Smoothly move a rigidbody towards a target position
// Arguments: target position, rigidbody, target velocity, maximum velocity change
void MoveTowards(Vector3 targetPos, Rigidbody rb, float targetVel, float maxVel)
{
var moveToPos = targetPos - rb.worldCenterOfMass;
var velocityTarget = Time.fixedDeltaTime * targetVel * moveToPos;
if (float.IsNaN(velocityTarget.x) == false)
{
// Move the current velocity towards the target velocity by at most maxVel
rb.velocity = Vector3.MoveTowards(rb.velocity, velocityTarget, maxVel);
}
}
// Vector observations
public override void CollectObservations(VectorSensor sensor)
{
var agentPos = m_AgentRb.position - ground.transform.position;
// The agent's position relative to the ground platform (normalized)
sensor.AddObservation(agentPos / 20f);
// Whether the agent is grounded
sensor.AddObservation(DoGroundCheck(true) ? 1 : 0);
}
// Pick a random position inside the spawn area to place the block
public Vector3 GetRandomSpawnPos()
{
var randomPosX = Random.Range(-m_SpawnAreaBounds.extents.x,
m_SpawnAreaBounds.extents.x);
var randomPosZ = Random.Range(-m_SpawnAreaBounds.extents.z,
m_SpawnAreaBounds.extents.z);
var randomSpawnPos = spawnArea.transform.position +
new Vector3(randomPosX, 0.45f, randomPosZ);
return randomSpawnPos;
}
// Temporarily swap the ground material, then restore it
IEnumerator GoalScoredSwapGroundMaterial(Material mat, float time)
{
m_GroundRenderer.material = mat;
yield return new WaitForSeconds(time); // wait for the given time
m_GroundRenderer.material = m_GroundMaterial;
}
// Translate the discrete actions into movement
public void MoveAgent(ActionSegment<int> act)
{
AddReward(-0.0005f);
// Two different ground checks (a tight raycast check and a looser box check)
var smallGrounded = DoGroundCheck(true);
var largeGrounded = DoGroundCheck(false);
var dirToGo = Vector3.zero;
var rotateDir = Vector3.zero;
// Read the four discrete action branches
var dirToGoForwardAction = act[0];
var rotateDirAction = act[1];
var dirToGoSideAction = act[2];
var jumpAction = act[3];
// Compute the corresponding movement from the discrete actions
if (dirToGoForwardAction == 1)
dirToGo = (largeGrounded ? 1f : 0.5f) * 1f * transform.forward;
else if (dirToGoForwardAction == 2)
dirToGo = (largeGrounded ? 1f : 0.5f) * -1f * transform.forward;
if (rotateDirAction == 1)
rotateDir = transform.up * -1f;
else if (rotateDirAction == 2)
rotateDir = transform.up * 1f;
if (dirToGoSideAction == 1)
dirToGo = (largeGrounded ? 1f : 0.5f) * -0.6f * transform.right;
else if (dirToGoSideAction == 2)
dirToGo = (largeGrounded ? 1f : 0.5f) * 0.6f * transform.right;
if (jumpAction == 1)
if ((jumpingTime <= 0f) && smallGrounded)
{
Jump();
}
// Apply rotation and movement forces
transform.Rotate(rotateDir, Time.fixedDeltaTime * 300f);
m_AgentRb.AddForce(dirToGo * m_WallJumpSettings.agentRunSpeed,
ForceMode.VelocityChange);
// While the jump is still in progress, move towards the jump apex
if (jumpingTime > 0f)
{
m_JumpTargetPos =
new Vector3(m_AgentRb.position.x,
m_JumpStartingPos.y + m_WallJumpSettings.agentJumpHeight,
m_AgentRb.position.z) + dirToGo;
MoveTowards(m_JumpTargetPos, m_AgentRb, m_WallJumpSettings.agentJumpVelocity,
m_WallJumpSettings.agentJumpVelocityMaxChange);
}
// Apply an extra downward force while falling so the jump feels less floaty
if (!(jumpingTime > 0f) && !largeGrounded)
{
m_AgentRb.AddForce(
Vector3.down * fallingForce, ForceMode.Acceleration);
}
jumpingTime -= Time.fixedDeltaTime;
}
public override void OnActionReceived(ActionBuffers actionBuffers)
{
MoveAgent(actionBuffers.DiscreteActions);
// If the agent or the block has fallen off the platform: penalize and restart
if ((!Physics.Raycast(m_AgentRb.position, Vector3.down, 20))
|| (!Physics.Raycast(m_ShortBlockRb.position, Vector3.down, 20)))
{
SetReward(-1f);
EndEpisode();
ResetBlock(m_ShortBlockRb);
StartCoroutine(
GoalScoredSwapGroundMaterial(m_WallJumpSettings.failMaterial, .5f));
}
}
public override void Heuristic(in ActionBuffers actionsOut)
{
var discreteActionsOut = actionsOut.DiscreteActions;
if (Input.GetKey(KeyCode.D))
{
discreteActionsOut[1] = 2;
}
if (Input.GetKey(KeyCode.W))
{
discreteActionsOut[0] = 1;
}
if (Input.GetKey(KeyCode.A))
{
discreteActionsOut[1] = 1;
}
if (Input.GetKey(KeyCode.S))
{
discreteActionsOut[0] = 2;
}
discreteActionsOut[3] = Input.GetKey(KeyCode.Space) ? 1 : 0;
}
// Touching the goal: reward and restart
void OnTriggerStay(Collider col)
{
if (col.gameObject.CompareTag("goal") && DoGroundCheck(true))
{
SetReward(1f);
EndEpisode();
StartCoroutine(
GoalScoredSwapGroundMaterial(m_WallJumpSettings.goalScoredMaterial, 2));
}
}
// Reset the block
void ResetBlock(Rigidbody blockRb)
{
blockRb.transform.position = GetRandomSpawnPos();
blockRb.velocity = Vector3.zero;
blockRb.angularVelocity = Vector3.zero;
}
// Reset the block and the agent position at the start of each episode
public override void OnEpisodeBegin()
{
ResetBlock(m_ShortBlockRb);
transform.localPosition = new Vector3(
18 * (Random.value - 0.5f), 1, -12);
m_Configuration = Random.Range(0, 5);
m_AgentRb.velocity = default(Vector3);
}
void FixedUpdate()
{
// Apply the wall-height configuration once per episode
if (m_Configuration != -1)
{
ConfigureAgent(m_Configuration);
m_Configuration = -1;
}
}
// Set the wall height and select the matching model
void ConfigureAgent(int config)
{
var localScale = wall.transform.localScale;
if (config == 0)
{
localScale = new Vector3(
localScale.x,
m_ResetParams.GetWithDefault("no_wall_height", 0),
localScale.z);
wall.transform.localScale = localScale;
// Set the Behavior Name from the config file and the corresponding model
SetModel(m_NoWallBehaviorName, noWallBrain);
}
else if (config == 1)
{
// Without curriculum learning the default value 4 is used
// With curriculum learning the wall height comes from the config value, which changes as training progresses
localScale = new Vector3(
localScale.x,
m_ResetParams.GetWithDefault("small_wall_height", 4),
localScale.z);
wall.transform.localScale = localScale;
SetModel(m_SmallWallBehaviorName, smallWallBrain);
}
else
{
var height = m_ResetParams.GetWithDefault("big_wall_height", 8);
localScale = new Vector3(
localScale.x,
height,
localScale.z);
wall.transform.localScale = localScale;
SetModel(m_BigWallBehaviorName, bigWallBrain);
}
}
}
Note that this example finally makes use of the ModelOverrider script. Its point is that the model can be swapped at runtime via SetModel(m_BigWallBehaviorName, bigWallBrain), where the first argument is the Behavior Name from the config file and the second is the corresponding model asset. This is what lets us use a different model in different situations.
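As a rough sketch of that pattern, the selection logic boils down to a helper like the one below. This is not part of the project: it would live inside WallJumpAgent and reuse its fields, and the height threshold of 6 is made up purely for illustration.

// Sketch only: pick the behavior name / model pair that matches the current wall height.
void SelectBrainForWall(float wallHeight)
{
    if (wallHeight <= 0f)
        SetModel(m_NoWallBehaviorName, noWallBrain);        // no wall at all
    else if (wallHeight < 6f)                               // illustrative threshold
        SetModel(m_SmallWallBehaviorName, smallWallBrain);  // low wall, jump over directly
    else
        SetModel(m_BigWallBehaviorName, bigWallBrain);      // high wall, needs the block
}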
Configuration Files
Config 1: PPO. Unlike previous examples, two behaviors are configured here, one named BigWallJump and the other SmallWallJump. This is because the environment equips the agent with two neural networks and calls the appropriate one depending on the situation, each learning its own task. A single network that has to handle too much usually needs a very long time to train and may never converge; splitting the problem into sub-tasks and switching between networks at the right moments not only speeds up training but also gives better results.
In my tests, this configuration fails to converge on the high wall: the agent never learns to use the block, and the mean reward settles around -0.6.
behaviors:
BigWallJump:
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 20000000
time_horizon: 128
summary_freq: 20000
SmallWallJump:
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 128
summary_freq: 20000
Config 2: SAC.
In my tests, this configuration also fails to converge on the high wall and never uses the block. It performs worse than PPO, with a mean reward around -1.
behaviors:
BigWallJump:
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 200000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 20.0
save_replay_buffer: false
init_entcoef: 0.1
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 15000000
time_horizon: 128
summary_freq: 20000
SmallWallJump:
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 20.0
save_replay_buffer: false
init_entcoef: 0.1
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 128
summary_freq: 20000
Config 3: the same PPO setup, with Curriculum Learning added on top.
behaviors:
BigWallJump:
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 20000000
time_horizon: 128
summary_freq: 20000
SmallWallJump:
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 128
summary_freq: 20000
environment_parameters:
big_wall_height:
curriculum:
- name: Lesson0
completion_criteria:
measure: progress
behavior: BigWallJump
signal_smoothing: true
min_lesson_length: 100
threshold: 0.1
value:
sampler_type: uniform
sampler_parameters:
min_value: 0.0
max_value: 4.0
- name: Lesson1
completion_criteria:
measure: progress
behavior: BigWallJump
signal_smoothing: true
min_lesson_length: 100
threshold: 0.3
value:
sampler_type: uniform
sampler_parameters:
min_value: 4.0
max_value: 7.0
- name: Lesson2
completion_criteria:
measure: progress
behavior: BigWallJump
signal_smoothing: true
min_lesson_length: 100
threshold: 0.5
value:
sampler_type: uniform
sampler_parameters:
min_value: 6.0
max_value: 8.0
- name: Lesson3
value: 8.0
small_wall_height:
curriculum:
- name: Lesson0
completion_criteria:
measure: progress
behavior: SmallWallJump
signal_smoothing: true
min_lesson_length: 100
threshold: 0.1
value: 1.5
- name: Lesson1
completion_criteria:
measure: progress
behavior: SmallWallJump
signal_smoothing: true
min_lesson_length: 100
threshold: 0.3
value: 2.0
- name: Lesson2
completion_criteria:
measure: progress
behavior: SmallWallJump
signal_smoothing: true
min_lesson_length: 100
threshold: 0.5
value: 2.5
- name: Lesson3
value: 4.0
The first half is identical to the plain PPO config; the difference starts after environment_parameters:, where the curriculum learning parameters are defined.
The idea behind curriculum learning is to progress step by step. When a task is very hard, people tend to set a series of small goals and improve gradually; trying to solve everything at once is unrealistic, which is exactly why the configurations above failed to reach the goal. In this task we can therefore let the wall grow taller gradually as training proceeds.
For each of the two behaviors, training is split into four lessons: Lesson0, Lesson1, Lesson2, and Lesson3.
The parameters mean the following:
- measure: how lesson progress is measured. reward measures progress by cumulative reward; progress measures it by the ratio steps / max_steps.
- threshold: used together with measure; once the measure reaches this value, training automatically advances to the next lesson.
- min_lesson_length: the minimum number of episodes that must be completed before the lesson can change.
- signal_smoothing: whether to smooth the current progress measure by weighting it with previous values.
- value: the environment-parameter value used in each lesson. It can also be sampled: sampler_type selects the sampler, and sampler_parameters provides min_value and max_value; with uniform the value is drawn uniformly between them, while gaussian samples from a normal distribution (a gaussian example is sketched right after this list).
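The configs above only use uniform samplers. For reference, a lesson value can also be drawn from a normal distribution; a minimal sketch of such a value block might look like this (the numbers are made up, and the mean / st_dev keys follow the ML-Agents sampler format):

value:
  sampler_type: gaussian
  sampler_parameters:
    mean: 6.0     # assumed mean wall height for this lesson
    st_dev: 1.0   # assumed standard deviation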
The old-style curriculum configuration looked like this:
BigWallJump:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
big_wall_min_height: [0.0, 4.0, 6.0, 8.0]
big_wall_max_height: [4.0, 7.0, 8.0, 8.0]
SmallWallJump:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
small_wall_height: [1.5, 2.0, 2.5, 4.0]
In my tests, this curriculum setup reaches a mean reward of about 0.7. The agent still fails occasionally, but the result is far better than the two configurations above; subdividing the curriculum further could raise the score even more.
Training command:
mlagents-learn config/ppo/WallJump.yaml --run-id=WallJump --force
Summary
Compared with the earlier examples, the new ideas in this one are:
- Jump control: two different ground checks are used, and an extra downward force is applied while falling (my own instinct would have been to simply give the agent an upward velocity on jump and leave the rest to the physics engine).
- Multi-model training: three models for three situations, and learning how to use the ModelOverrider script.
- Curriculum Learning: exploring its effect on training and how to write the corresponding configuration.