Source code for cntk.contrib.deeprl.agent.tabular_qlearning

# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""Tabular Q-learning."""

import copy

import numpy as np

from .agent import AgentBaseClass
from .shared.qlearning_parameters import QLearningParameters


class TabularQLearning(AgentBaseClass):
    """Q-learning agent with tabular representation."""

    def __init__(self, cfg_filename, o_space, a_space):
        """Constructor for Q learning algorithm with tabular representation."""
        super(TabularQLearning, self).__init__(o_space, a_space)

        self._parameters = QLearningParameters(cfg_filename)
        if self._parameters.q_representation != 'tabular':
            raise ValueError(
                'Unexpected representation for tabular Q-learning: "{0}"'
                '\n'.format(self._parameters.q_representation))

        # Discretize the observation space if necessary.
        if self._classname(o_space) != 'gym.spaces.discrete.Discrete':
            self._discretize_observation_space(
                o_space, self._parameters.discretization_resolution)

        self._q = self._parameters.initial_q + \
            np.zeros((self._num_states, self._num_actions))

        print('Initialized discrete Q-learning agent with {0} states and '
              '{1} actions.'.format(self._num_states, self._num_actions))

        self.episode_count = 0
        # step_count is incremented each time after receiving reward.
        self.step_count = 0
    def start(self, state):
        """Start a new episode."""
        self._adjust_exploration_rate()
        self._last_state = self._preprocess_state(state)
        self._last_action, action_behavior = \
            self._choose_action(self._last_state)
        self.episode_count += 1

        return self._last_action, {
            'action_behavior': action_behavior,
            'epsilon': self._epsilon}
    def step(self, reward, next_state):
        """Observe one transition and choose an action."""
        self._adjust_learning_rate()
        self.step_count += 1
        next_encoded_state = self._preprocess_state(next_state)
        # TD error of the Q-learning update:
        # reward + gamma * max_a Q(s', a) - Q(s, a)
        td_err = reward + self._parameters.gamma * \
            np.max(self._q[next_encoded_state]) - \
            self._q[self._last_state, self._last_action]
        self._q[self._last_state, self._last_action] += self._eta * td_err

        self._adjust_exploration_rate()
        self._last_state = next_encoded_state
        self._last_action, action_behavior = self._choose_action(
            self._last_state)

        return self._last_action, {
            'action_behavior': action_behavior,
            'epsilon': self._epsilon}
    def end(self, reward, next_state):
        """Last observed reward/state of the episode (which then terminates)."""
        self._adjust_learning_rate()
        self.step_count += 1
        # Terminal update: the target is the reward alone, since no future
        # value follows the end of the episode.
        td_err = reward - self._q[self._last_state, self._last_action]
        self._q[self._last_state, self._last_action] += self._eta * td_err
    def set_as_best_model(self):
        """Copy current model to best model."""
        self._best_model = copy.deepcopy(self._q)
    def save(self, filename):
        """Save best model to file."""
        with open(filename, 'w') as f:
            for s in range(self._num_states):
                f.write('{0}\t{1}\n'.format(s, str(self._best_model[s])))
    def save_parameter_settings(self, filename):
        """Save parameter settings to file."""
        self._parameters.save(filename)
    def enter_evaluation(self):
        """Setup before evaluation."""
        self._epsilon = 0
    def _adjust_learning_rate(self):
        # Anneal the learning rate eta linearly from initial_eta down to
        # eta_minimum over eta_decay_step_count steps.
        self._eta = self._parameters.eta_minimum + max(
            0,
            (self._parameters.initial_eta - self._parameters.eta_minimum) *
            (1 - float(self.step_count) /
             self._parameters.eta_decay_step_count))

    def _adjust_exploration_rate(self):
        # Anneal the exploration rate epsilon linearly from initial_epsilon
        # down to epsilon_minimum over epsilon_decay_step_count steps.
        self._epsilon = self._parameters.epsilon_minimum + max(
            0,
            (self._parameters.initial_epsilon -
             self._parameters.epsilon_minimum) *
            (1 - float(self.step_count) /
             self._parameters.epsilon_decay_step_count))

    def _choose_action(self, state):
        """Epsilon greedy policy."""
        if np.random.uniform(0, 1) < self._epsilon:
            return np.random.randint(self._num_actions), 'RANDOM'
        else:
            return np.argmax(self._q[state]), 'GREEDY'

    def _preprocess_state(self, state):
        """Discretize state to table row index."""
        o = self._discretize_state_if_necessary(state)
        return o
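
The class exposes a start/step/end episode interface. As a usage illustration only (not part of this module), the sketch below drives the agent through a classic OpenAI Gym episode loop. The environment name 'FrozenLake-v0', the config filename 'tabular_qlearning.cfg', the model filenames, and the episode count are assumptions for the example; the config file is expected to set q_representation to 'tabular', otherwise the constructor raises ValueError.

# Illustrative training loop (a sketch; file and environment names are
# assumptions, and the old gym API that returns (obs, reward, done, info)
# from env.step is assumed).
import gym

from cntk.contrib.deeprl.agent.tabular_qlearning import TabularQLearning

env = gym.make('FrozenLake-v0')  # discrete observation and action spaces
agent = TabularQLearning('tabular_qlearning.cfg',
                         env.observation_space, env.action_space)

for episode in range(1000):
    observation = env.reset()
    action, debug_info = agent.start(observation)
    done = False
    while not done:
        observation, reward, done, _ = env.step(action)
        if done:
            # Terminal transition: update Q without bootstrapping.
            agent.end(reward, observation)
        else:
            action, debug_info = agent.step(reward, observation)

agent.set_as_best_model()
agent.save('tabular_q.model')
agent.save_parameter_settings('tabular_q.params')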