- Numpy imported
- Expected Sarsa applied to the Grid problem; the update rule is restated below for reference
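For context, `agent_step` below implements the standard Expected Sarsa update, where \pi is the agent's epsilon-greedy policy over the next state's action-values (this is the textbook form of the update, stated here for reference rather than taken from the original notes):

Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \Big[ R_{t+1} + \gamma \sum_{a} \pi(a \mid S_{t+1})\, Q(S_{t+1}, a) - Q(S_t, A_t) \Big]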
import numpy as np

# BaseAgent is assumed to be provided by the surrounding course/RL-Glue-style
# framework; it is not defined in this file.
class ExpectedSarsaAgent(BaseAgent):
    def agent_init(self, agent_init_info):
        """Setup for the agent called when the experiment first starts.

        Args:
            agent_init_info (dict), the parameters used to initialize the agent. The dictionary contains:
            {
                num_states (int): The number of states,
                num_actions (int): The number of actions,
                epsilon (float): The epsilon parameter for exploration,
                step_size (float): The step-size,
                discount (float): The discount factor,
                seed (int): The seed for the random number generator,
            }
        """
        # Store the provided parameters.
        self.num_actions = agent_init_info["num_actions"]
        self.num_states = agent_init_info["num_states"]
        self.epsilon = agent_init_info["epsilon"]
        self.step_size = agent_init_info["step_size"]
        self.discount = agent_init_info["discount"]
        self.rand_generator = np.random.RandomState(agent_init_info["seed"])

        # Action-value estimates, initialized to zero.
        self.q = np.zeros((self.num_states, self.num_actions))
    def agent_start(self, observation):
        """The first method called when the episode starts, called after
        the environment starts.

        Args:
            observation (int): the state observation from the
                environment's env_start function.
        Returns:
            action (int): the first action the agent takes.
        """
        state = observation
        current_q = self.q[state, :]

        # Choose the first action epsilon-greedily with respect to the current action-values.
        if self.rand_generator.rand() < self.epsilon:
            action = self.rand_generator.randint(self.num_actions)
        else:
            action = self.argmax(current_q)

        self.prev_state = state
        self.prev_action = action
        return action
    def agent_step(self, reward, observation):
        """A step taken by the agent.

        Args:
            reward (float): the reward received for taking the last action taken
            observation (int): the state observation from the
                environment's step based on where the agent ended up after the
                last step.
        Returns:
            action (int): the action the agent is taking.
        """
        state = observation
        current_q = self.q[state, :]

        # Choose the next action epsilon-greedily.
        if self.rand_generator.rand() < self.epsilon:
            action = self.rand_generator.randint(self.num_actions)
        else:
            action = self.argmax(current_q)

        # Epsilon-greedy policy probabilities over the next state's actions:
        # every action gets epsilon / num_actions, and the greedy actions
        # share the remaining (1 - epsilon) probability mass equally.
        num_greedy = np.sum(current_q == np.max(current_q))
        policy = np.ones(self.num_actions) * self.epsilon / self.num_actions
        policy[np.argwhere(current_q == np.max(current_q)).flatten()] = (1 - self.epsilon) / num_greedy + self.epsilon / self.num_actions

        # Expected Sarsa target: reward plus the discounted expectation of the
        # next action-values under the epsilon-greedy policy.
        target = reward + self.discount * np.dot(policy, current_q)
        self.q[self.prev_state, self.prev_action] += self.step_size * (target - self.q[self.prev_state, self.prev_action])

        self.prev_state = state
        self.prev_action = action
        return action
    def agent_end(self, reward):
        """Run when the agent terminates.

        Args:
            reward (float): the reward the agent received for entering the
                terminal state.
        """
        # The terminal state has no action-values, so the target is just the reward.
        target = reward
        self.q[self.prev_state, self.prev_action] += self.step_size * (target - self.q[self.prev_state, self.prev_action])
    def argmax(self, q_values):
        """argmax with random tie-breaking

        Args:
            q_values (Numpy array): the array of action-values
        Returns:
            action (int): an action with the highest value
        """
        top = float("-inf")
        ties = []
        for i in range(len(q_values)):
            if q_values[i] > top:
                top = q_values[i]
                ties = []
            if q_values[i] == top:
                ties.append(i)
        return self.rand_generator.choice(ties)
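A minimal driver sketch for the agent above. The 1-D corridor "grid", the run_episode helper, and all of the parameter values here are illustrative assumptions rather than part of the original notebook; it also assumes BaseAgent can be constructed with no arguments, as in the usual RL-Glue-style framework.

# Hypothetical driver: a 5-state corridor with states 0..4, start at state 2,
# action 0 = left, action 1 = right, reward +1 for reaching the terminal state 4.
import numpy as np

def run_episode(agent, num_states=5, start_state=2, goal_state=4, max_steps=100):
    """Roll out one episode of the toy corridor with the given agent."""
    state = start_state
    action = agent.agent_start(state)
    for _ in range(max_steps):
        # Move left or right, clipped to the grid boundaries.
        state = max(0, min(num_states - 1, state + (1 if action == 1 else -1)))
        if state == goal_state:
            agent.agent_end(reward=1.0)  # terminal transition
            return
        action = agent.agent_step(reward=0.0, observation=state)

agent_info = {"num_states": 5, "num_actions": 2, "epsilon": 0.1,
              "step_size": 0.5, "discount": 1.0, "seed": 0}
agent = ExpectedSarsaAgent()
agent.agent_init(agent_info)

for episode in range(200):
    run_episode(agent)

print(np.round(agent.q, 2))  # learned action-values for the toy corridor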