# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""Q learning parameters."""
import ast
import configparser

import numpy as np


class QLearningParameters:
    """Parameters used by the Q-learning algorithm."""

    def __init__(self, config_file):
        """Read parameter values from config_file.

        Use the default value for any parameter that is not present.
        """
        # TODO: validate parameter values.
        self.config = configparser.ConfigParser()
        # Preserve the case of option names instead of lower-casing them.
        self.config.optionxform = str
        self.config.read(config_file)
        # Discount factor.
        self.gamma = self.config.getfloat(
            'General', 'Gamma', fallback=0.95)
        # Name of the class that does preprocessing.
        self.preprocessing = self.config.get(
            'General', 'PreProcessing', fallback='')
        # Arguments of the preprocessing class as a tuple, excluding its
        # first argument, input_shape.
        self.preprocessing_args = self.config.get(
            'General', 'PreProcessingArgs', fallback='()')
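        # Illustrative (and assumed) consumer pattern for the two settings
        # above: the class named by PreProcessing is looked up elsewhere and
        # constructed roughly as
        #   cls(input_shape, *ast.literal_eval(self.preprocessing_args))
        # This class itself only stores the strings.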
        # Representation of the Q function, taking a value from
        # {'tabular', 'nn'}.
        self.q_representation = self.config.get(
            'QLearningAlgo', 'QRepresentation', fallback='tabular')
        # Initial value of epsilon (exploration rate), used by the
        # epsilon-greedy policy.
        self.initial_epsilon = self.config.getfloat(
            'QLearningAlgo', 'InitialEpsilon', fallback=0.1)
        # Number of steps before epsilon reaches its minimum value.
        self.epsilon_decay_step_count = self.config.getint(
            'QLearningAlgo', 'EpsilonDecayStepCount', fallback=100000)
        # Minimum value of epsilon.
        self.epsilon_minimum = self.config.getfloat(
            'QLearningAlgo', 'EpsilonMinimum', fallback=0.01)
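        # A minimal sketch of how these three values are commonly combined
        # into a linear annealing schedule (assumption: the actual schedule
        # lives in the training loop, not in this class):
        #   epsilon = max(
        #       self.epsilon_minimum,
        #       self.initial_epsilon
        #       - (self.initial_epsilon - self.epsilon_minimum)
        #       * step / self.epsilon_decay_step_count)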
        # Initial value of eta, the learning rate for gradient descent.
        self.initial_eta = self.config.getfloat(
            'Optimization', 'InitialEta', fallback=0.001)
        # Number of steps before eta reaches its minimum value.
        self.eta_decay_step_count = self.config.getint(
            'Optimization', 'EtaDecayStepCount', fallback=100000)
        # Minimum value of eta. Since Adam is used as the optimizer, a good
        # starting point is to set EtaMinimum equal to InitialEta, which is
        # equivalent to using a constant learning rate.
        self.eta_minimum = self.config.getfloat(
            'Optimization', 'EtaMinimum', fallback=0.001)
        # Momentum used by RMSProp.
        self.momentum = self.config.getfloat(
            'Optimization', 'Momentum', fallback=0.95)
        # Initial value for table entries.
        # TODO(maoyi): allow DQN initialization through config file.
        self.initial_q = self.config.getfloat(
            'QLearningAlgo', 'InitialQ', fallback=0.0)
        # Number of partitions for discretizing a continuous space. Either a
        # scalar, which is applied to all dimensions, or a list specifying a
        # different value for each dimension.
        self.discretization_resolution = ast.literal_eval(self.config.get(
            'QLearningAlgo', 'DiscretizationResolution', fallback='10'))
        if isinstance(self.discretization_resolution, list):
            self.discretization_resolution = np.array(
                self.discretization_resolution)
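        # For example (illustrative values, not defaults): a config entry of
        # '10' yields the scalar 10 for every dimension, while '[10, 20]'
        # is converted to np.array([10, 20]), one resolution per dimension.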
        # Number of actions chosen between successive target network updates.
        self.target_q_update_frequency = self.config.getint(
            'QLearningAlgo', 'TargetQUpdateFrequency', fallback=10000)
        # Sample size of each minibatch.
        self.minibatch_size = self.config.getint(
            'QLearningAlgo', 'MinibatchSize', fallback=32)
        # Number of replays per update.
        self.replays_per_update = self.config.getint(
            'QLearningAlgo', 'ReplaysPerUpdate', fallback=1)
        # Number of actions chosen between successive SGD updates of Q.
        self.q_update_frequency = self.config.getint(
            'QLearningAlgo', 'QUpdateFrequency', fallback=4)
        # Use the Huber loss with delta=1 when True. Otherwise, use the
        # squared loss.
        self.use_error_clipping = self.config.getboolean(
            'QLearningAlgo', 'ErrorClipping', fallback=True)
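        # For reference, the Huber loss with delta=1 on a TD error e is
        #   0.5 * e**2    if |e| <= 1
        #   |e| - 0.5     otherwise,
        # which clips the gradient of the error term to [-1, 1].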
        # Capacity of the replay memory.
        self.replay_memory_capacity = self.config.getint(
            'ExperienceReplay', 'Capacity', fallback=100000)
        # A uniform random policy is run for this number of steps to populate
        # the replay memory.
        self.replay_start_size = self.config.getint(
            'ExperienceReplay', 'StartSize', fallback=5000)
        # Use prioritized replay. Fall back to uniform sampling when False.
        self.use_prioritized_replay = self.config.getboolean(
            'ExperienceReplay', 'Prioritized', fallback=False)
        # Used by prioritized replay to determine how much prioritization is
        # applied, with 0 corresponding to uniform sampling.
        self.priority_alpha = self.config.getfloat(
            'ExperienceReplay', 'PriorityAlpha', fallback=0.7)
        # Used by prioritized replay to anneal the amount of importance
        # sampling correction.
        self.priority_beta = self.config.getfloat(
            'ExperienceReplay', 'PriorityBeta', fallback=0.5)
        # Used by prioritized replay to prevent transitions from never being
        # revisited once their error reaches zero.
        self.priority_epsilon = self.config.getfloat(
            'ExperienceReplay', 'PriorityEpsilon', fallback=0.01)
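        # In the standard prioritized replay formulation (Schaul et al.),
        # transitions are sampled with probability proportional to
        # (|td_error| + priority_epsilon) ** priority_alpha and corrected
        # with importance-sampling weights (N * P(i)) ** -priority_beta;
        # this class only stores the three values.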
        # Number of nodes in each hidden layer, starting after the input
        # layer.
        self.hidden_layers = self.config.get(
            'NetworkModel', 'HiddenLayerNodes', fallback='[20]')
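        # Note: unlike DiscretizationResolution, this value is kept as a
        # string (e.g. '[20]'); consumers of this class presumably parse it
        # themselves, e.g. with ast.literal_eval.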
        # Maximum norm of the gradient per sample. No gradient clipping is
        # performed if the parameter is missing from the config file (the
        # fallback is np.inf).
        self.gradient_clipping_threshold = self.config.getfloat(
            'Optimization', 'GradientClippingThreshold', fallback=np.inf)
        # Use double Q-learning if True.
        self.double_q_learning = self.config.getboolean(
            'QLearningAlgo', 'DoubleQLearning', fallback=False)
    def save(self, config_file):
        """Write the underlying configparser contents to config_file."""
        with open(config_file, 'w') as c:
            self.config.write(c)
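

# Example usage (illustrative only; 'q_learning.ini' is a hypothetical file,
# and any option it omits falls back to the defaults listed above):
#
#     params = QLearningParameters('q_learning.ini')
#     print(params.gamma, params.initial_epsilon, params.q_representation)
#     params.save('q_learning_copy.ini')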