Source code for cntk.contrib.deeprl.agent.shared.qlearning_parameters

# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""Q learning parameters."""

import ast
import configparser

import numpy as np


class QLearningParameters:
    """Parameters used by the Q-learning algorithm."""

    def __init__(self, config_file):
        """Read parameter values from config_file.

        Use the default value if a value is not present.
        """
        # TODO: validate parameter values.
        self.config = configparser.ConfigParser()
        self.config.optionxform = str
        self.config.read(config_file)
        # Discount factor.
        self.gamma = self.config.getfloat(
            'General', 'Gamma', fallback=0.95)
        # Name of class that does preprocessing.
        self.preprocessing = self.config.get(
            'General', 'PreProcessing', fallback='')
        # Arguments (except the first argument input_shape) of preprocessing
        # as a tuple.
        self.preprocessing_args = self.config.get(
            'General', 'PreProcessingArgs', fallback='()')
        # Representation of Q function, taking value from {'tabular', 'nn'}.
        self.q_representation = self.config.get(
            'QLearningAlgo', 'QRepresentation', fallback='tabular')
        # Initial value of epsilon (exploration rate), used by the
        # epsilon-greedy policy.
        self.initial_epsilon = self.config.getfloat(
            'QLearningAlgo', 'InitialEpsilon', fallback=0.1)
        # Number of steps before epsilon reaches its minimum value.
        self.epsilon_decay_step_count = self.config.getint(
            'QLearningAlgo', 'EpsilonDecayStepCount', fallback=100000)
        # Minimum value of epsilon.
        self.epsilon_minimum = self.config.getfloat(
            'QLearningAlgo', 'EpsilonMinimum', fallback=0.01)
        # Initial value of eta, which is the learning rate for gradient
        # descent.
        self.initial_eta = self.config.getfloat(
            'Optimization', 'InitialEta', fallback=0.001)
        # Number of steps before eta reaches its minimum value.
        self.eta_decay_step_count = self.config.getint(
            'Optimization', 'EtaDecayStepCount', fallback=100000)
        # Minimum value of eta. Since Adam is used as the optimizer, a good
        # starting point is to set EtaMinimum equal to InitialEta, which is
        # equivalent to using a constant learning rate.
        self.eta_minimum = self.config.getfloat(
            'Optimization', 'EtaMinimum', fallback=0.001)
        # Momentum used by RMSProp.
        self.momentum = self.config.getfloat(
            'Optimization', 'Momentum', fallback=0.95)
        # Initial value for table entries.
        # TODO(maoyi): allow DQN initialization through config file.
        self.initial_q = self.config.getfloat(
            'QLearningAlgo', 'InitialQ', fallback=0.0)
        # Number of partitions for discretizing the continuous space. Either
        # a scalar applied to all dimensions, or a list specifying a
        # different value for each dimension.
        self.discretization_resolution = ast.literal_eval(self.config.get(
            'QLearningAlgo', 'DiscretizationResolution', fallback='10'))
        if isinstance(self.discretization_resolution, list):
            self.discretization_resolution = np.array(
                self.discretization_resolution)
        # Number of actions chosen between successive target network updates.
        self.target_q_update_frequency = self.config.getint(
            'QLearningAlgo', 'TargetQUpdateFrequency', fallback=10000)
        # Sample size of each minibatch.
        self.minibatch_size = self.config.getint(
            'QLearningAlgo', 'MinibatchSize', fallback=32)
        # Number of replays per update.
        self.replays_per_update = self.config.getint(
            'QLearningAlgo', 'ReplaysPerUpdate', fallback=1)
        # Number of actions chosen between successive SGD updates of Q.
        self.q_update_frequency = self.config.getint(
            'QLearningAlgo', 'QUpdateFrequency', fallback=4)
        # Use Huber loss with \delta=1 when True. Otherwise, use least
        # squares loss.
        self.use_error_clipping = self.config.getboolean(
            'QLearningAlgo', 'ErrorClipping', fallback=True)
        # Capacity of replay memory.
        self.replay_memory_capacity = self.config.getint(
            'ExperienceReplay', 'Capacity', fallback=100000)
        # A uniform random policy is run for this number of steps to populate
        # replay memory.
        self.replay_start_size = self.config.getint(
            'ExperienceReplay', 'StartSize', fallback=5000)
        # Use prioritized replay. Fall back to uniform sampling when False.
        self.use_prioritized_replay = self.config.getboolean(
            'ExperienceReplay', 'Prioritized', fallback=False)
        # Used by prioritized replay, to determine how much prioritization is
        # used, with 0 corresponding to uniform.
        self.priority_alpha = self.config.getfloat(
            'ExperienceReplay', 'PriorityAlpha', fallback=0.7)
        # Used by prioritized replay, to anneal the amount of importance
        # sampling correction.
        self.priority_beta = self.config.getfloat(
            'ExperienceReplay', 'PriorityBeta', fallback=0.5)
        # Used by prioritized replay, to prevent transitions from never being
        # visited once their error is zero.
        self.priority_epsilon = self.config.getfloat(
            'ExperienceReplay', 'PriorityEpsilon', fallback=0.01)
        # Number of nodes in each hidden layer, starting after the input
        # layer.
        self.hidden_layers = self.config.get(
            'NetworkModel', 'HiddenLayerNodes', fallback='[20]')
        # Maximum norm of gradient per sample. No gradient clipping if the
        # parameter is missing from the config file.
        self.gradient_clipping_threshold = self.config.getfloat(
            'Optimization', 'GradientClippingThreshold', fallback=np.inf)
        # Use Double Q-learning if True.
        self.double_q_learning = self.config.getboolean(
            'QLearningAlgo', 'DoubleQLearning', fallback=False)

    def save(self, config_file):
        """Write the current configuration back to config_file."""
        with open(config_file, 'w') as c:
            self.config.write(c)
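
Example usage (a minimal, hypothetical sketch, not part of the module): the section and option names below mirror the getters in __init__, while the file names dqn.ini and dqn_out.ini and the specific values are illustrative only.

# Write an illustrative config file, load it through QLearningParameters,
# and round-trip it back to disk. Options missing from the file fall back
# to the defaults defined in __init__ (e.g. minibatch_size == 32).
example_config = """
[General]
Gamma = 0.99

[QLearningAlgo]
QRepresentation = nn
InitialEpsilon = 1.0
EpsilonMinimum = 0.1
DoubleQLearning = True

[Optimization]
InitialEta = 0.00025
EtaMinimum = 0.00025

[ExperienceReplay]
Capacity = 500000
Prioritized = True

[NetworkModel]
HiddenLayerNodes = [256, 128]
"""

with open('dqn.ini', 'w') as f:
    f.write(example_config)

params = QLearningParameters('dqn.ini')
print(params.gamma, params.q_representation, params.hidden_layers)
print(params.minibatch_size)  # 32, taken from the fallback value

# Persist the parsed configuration to a new file.
params.save('dqn_out.ini')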