Source code for cntk.contrib.deeprl.agent.tabular_qlearning

# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""Tabular Q-learning."""

import copy

import numpy as np

from .agent import AgentBaseClass
from .shared.qlearning_parameters import QLearningParameters


class TabularQLearning(AgentBaseClass):
    """Q-learning agent with tabular representation."""

    def __init__(self, cfg_filename, o_space, a_space):
        """Constructor for Q learning algorithm with tabular representation."""
        super(TabularQLearning, self).__init__(o_space, a_space)

        self._parameters = QLearningParameters(cfg_filename)
        if self._parameters.q_representation != 'tabular':
            raise ValueError(
                'Unexpected representation for tabular Q-learning: "{0}"'
                '\n'.format(self._parameters.q_representation))

        # Discretize the observation space if necessary.
        if self._classname(o_space) != 'gym.spaces.discrete.Discrete':
            self._discretize_observation_space(
                o_space, self._parameters.discretization_resolution)

        self._q = self._parameters.initial_q + \
            np.zeros((self._num_states, self._num_actions))

        print('Initialized discrete Q-learning agent with {0} states and '
              '{1} actions.'.format(self._num_states, self._num_actions))

        self.episode_count = 0
        # step_count is incremented each time after receiving reward.
        self.step_count = 0
    def start(self, state):
        """Start a new episode."""
        self._adjust_exploration_rate()
        self._last_state = self._preprocess_state(state)
        self._last_action, action_behavior = \
            self._choose_action(self._last_state)
        self.episode_count += 1

        return self._last_action, {
            'action_behavior': action_behavior,
            'epsilon': self._epsilon}
    def step(self, reward, next_state):
        """Observe one transition and choose an action."""
        self._adjust_learning_rate()
        self.step_count += 1
        next_encoded_state = self._preprocess_state(next_state)
        # TD error of the Q-learning update:
        # reward + gamma * max_a Q(s', a) - Q(s, a)
        td_err = reward + self._parameters.gamma * \
            np.max(self._q[next_encoded_state]) - \
            self._q[self._last_state, self._last_action]
        self._q[self._last_state, self._last_action] += self._eta * td_err

        self._adjust_exploration_rate()
        self._last_state = next_encoded_state
        self._last_action, action_behavior = self._choose_action(
            self._last_state)

        return self._last_action, {
            'action_behavior': action_behavior,
            'epsilon': self._epsilon}
    def end(self, reward, next_state):
        """Last observed reward/state of the episode (which then terminates)."""
        self._adjust_learning_rate()
        self.step_count += 1
        # Terminal update: the target is the reward alone, since no future
        # value follows the end of the episode.
        td_err = reward - self._q[self._last_state, self._last_action]
        self._q[self._last_state, self._last_action] += self._eta * td_err
    def set_as_best_model(self):
        """Copy current model to best model."""
        self._best_model = copy.deepcopy(self._q)
    def save(self, filename):
        """Save best model to file."""
        with open(filename, 'w') as f:
            for s in range(self._num_states):
                f.write('{0}\t{1}\n'.format(s, str(self._best_model[s])))
    def save_parameter_settings(self, filename):
        """Save parameter settings to file."""
        self._parameters.save(filename)
    def enter_evaluation(self):
        """Setup before evaluation."""
        self._epsilon = 0
    def _adjust_learning_rate(self):
        # Anneal the learning rate eta linearly from initial_eta down to
        # eta_minimum over eta_decay_step_count steps.
        self._eta = self._parameters.eta_minimum + max(
            0,
            (self._parameters.initial_eta - self._parameters.eta_minimum) *
            (1 - float(self.step_count) /
             self._parameters.eta_decay_step_count))

    def _adjust_exploration_rate(self):
        # Anneal the exploration rate epsilon linearly from initial_epsilon
        # down to epsilon_minimum over epsilon_decay_step_count steps.
        self._epsilon = self._parameters.epsilon_minimum + max(
            0,
            (self._parameters.initial_epsilon -
             self._parameters.epsilon_minimum) *
            (1 - float(self.step_count) /
             self._parameters.epsilon_decay_step_count))

    def _choose_action(self, state):
        """Epsilon greedy policy."""
        if np.random.uniform(0, 1) < self._epsilon:
            return np.random.randint(self._num_actions), 'RANDOM'
        else:
            return np.argmax(self._q[state]), 'GREEDY'

    def _preprocess_state(self, state):
        """Discretize state to table row index."""
        o = self._discretize_state_if_necessary(state)
        return o
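
The class exposes a start/step/end episode interface. As a usage illustration only (not part of this module), the sketch below drives the agent through a classic OpenAI Gym episode loop. The environment name 'FrozenLake-v0', the config filename 'tabular_qlearning.cfg', the model filenames, and the episode count are assumptions for the example; the config file is expected to set q_representation to 'tabular', otherwise the constructor raises ValueError.

# Illustrative training loop (a sketch; file and environment names are
# assumptions, and the old gym API that returns (obs, reward, done, info)
# from env.step is assumed).
import gym

from cntk.contrib.deeprl.agent.tabular_qlearning import TabularQLearning

env = gym.make('FrozenLake-v0')  # discrete observation and action spaces
agent = TabularQLearning('tabular_qlearning.cfg',
                         env.observation_space, env.action_space)

for episode in range(1000):
    observation = env.reset()
    action, debug_info = agent.start(observation)
    done = False
    while not done:
        observation, reward, done, _ = env.step(action)
        if done:
            # Terminal transition: update Q without bootstrapping.
            agent.end(reward, observation)
        else:
            action, debug_info = agent.step(reward, observation)

agent.set_as_best_model()
agent.save('tabular_q.model')
agent.save_parameter_settings('tabular_q.params')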