Source code for cntk.contrib.deeprl.agent.policy_gradient

# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""Actor-Critic Policy Gradient."""

import ast

import cntk as C
import numpy as np

from .agent import AgentBaseClass
from .shared.cntk_utils import negative_of_entropy_with_softmax
from .shared.models import Models
from .shared.policy_gradient_parameters import PolicyGradientParameters


class ActorCritic(AgentBaseClass):
    """
    Actor-Critic Policy Gradient.

    See https://arxiv.org/pdf/1602.01783.pdf for a description of the
    algorithm.
    """

    def __init__(self, config_filename, o_space, a_space):
        """
        Constructor for policy gradient.

        Args:
            config_filename: configuration file specifying training details.
            o_space: observation space; gym.spaces.tuple_space.Tuple is not
                supported.
            a_space: action space, limited to gym.spaces.discrete.Discrete.
        """
        super(ActorCritic, self).__init__(o_space, a_space)

        self._parameters = PolicyGradientParameters(config_filename)

        # Create preprocessor.
        if self._parameters.preprocessing:
            preproc = self._import_method(self._parameters.preprocessing)
            self._preprocessor = preproc(
                self._shape_of_inputs,
                *ast.literal_eval(self._parameters.preprocessing_args))

        self._set_up_policy_network_and_value_network()

        self._trajectory_states = []
        self._trajectory_actions = []
        self._trajectory_rewards = []

        # Training data for the policy and value networks. Note they share
        # the same input.
        self._input_buffer = []
        self._value_network_output_buffer = []
        self._policy_network_output_buffer = []
        self._policy_network_weight_buffer = []

        self.episode_count = 0
        self.step_count = 0

    def start(self, state):
        """
        Start a new episode.

        Args:
            state (object): observation provided by the environment.

        Returns:
            action (int): action chosen by agent.
            debug_info (dict): auxiliary diagnostic information.
        """
        # Call _process_accumulated_trajectory() to process any unused
        # trajectory data from the previous episode.
        self._process_accumulated_trajectory(False)

        # Reset preprocessor.
        if self._preprocessor is not None:
            self._preprocessor.reset()

        # Append new state and action.
        o = self._preprocess_state(state)
        action, _ = self._choose_action(o)
        self._trajectory_states.append(o)
        self._trajectory_actions.append(action)

        self.episode_count += 1
        return action, {}

    def step(self, reward, next_state):
        """
        Observe one transition and choose an action.

        Args:
            reward (float): amount of reward returned after previous action.
            next_state (object): observation provided by the environment.

        Returns:
            action (int): action chosen by agent.
            debug_info (dict): auxiliary diagnostic information.
        """
        o = self._preprocess_state(next_state)
        self._trajectory_rewards.append(reward)
        self._trajectory_states.append(o)
        self.step_count += 1

        # Update every self._parameters.update_frequency steps.
        if self.step_count % self._parameters.update_frequency == 0:
            self._process_accumulated_trajectory(True)
            self._update_networks()

        action, _ = self._choose_action(o)
        self._trajectory_actions.append(action)
        return action, {}

    def end(self, reward, next_state):
        """
        Last observed reward/state of the episode (which then terminates).

        Args:
            reward (float): amount of reward returned after previous action.
            next_state (object): observation provided by the environment.
        """
        self._trajectory_rewards.append(reward)
        self.step_count += 1

        # Update every self._parameters.update_frequency steps.
        if self.step_count % self._parameters.update_frequency == 0:
            self._process_accumulated_trajectory(False)
            self._update_networks()

    def set_as_best_model(self):
        """Copy current model to best model."""
        self._best_model = self._policy_network.clone('clone')

    def _set_up_policy_network_and_value_network(self):
        shape_of_inputs = self._shape_of_inputs if self._preprocessor is None \
            else self._preprocessor.output_shape()
        self._input_variables = \
            C.ops.input_variable(shape=shape_of_inputs, dtype=np.float32)

        # Set up policy network.
        if self._parameters.policy_representation == 'nn':
            model = Models.feedforward_network(
                shape_of_inputs,
                self._num_actions,
                self._parameters.policy_network_hidden_layers,
                C.losses.cross_entropy_with_softmax,
                use_placeholder_for_input=True)
        else:
            try:
                model_definition_function = self._import_method(
                    self._parameters.policy_representation)
                model = model_definition_function(
                    shape_of_inputs,
                    self._num_actions,
                    C.losses.cross_entropy_with_softmax,
                    use_placeholder_for_input=True)
            except ValueError:
                raise ValueError(
                    'Unknown representation for policy: "{0}"\n'.format(
                        self._parameters.policy_representation))

        self._policy_network = model['f']
        self._policy_network.replace_placeholder(self._input_variables)
        self._policy_network_output_variables = model['outputs']
        # The weight is computed as part of the Actor-Critic algorithm.
        self._policy_network_weight_variables = \
            C.ops.input_variable(shape=(1,), dtype=np.float32)
        self._policy_network_loss = \
            model['loss'] * self._policy_network_weight_variables

        # Initialized from a saved model.
        if self._parameters.initial_policy_network:
            self._policy_network.restore(
                self._parameters.initial_policy_network)

        print("Parameterized the agent's policy using neural networks "
              '"{0}" with {1} actions.\n'
              ''.format(self._parameters.policy_representation,
                        self._num_actions))

        # Set up value network.
        if self._parameters.shared_representation:
            # For shared representation, policy pi and value function V share
            # all non-output layers. To use cross_entropy_with_softmax loss
            # from cntk, _policy_network defined here doesn't include softmax
            # output layer. Therefore _value_network becomes _policy_network
            # plus one additional linear output layer.
            self._value_network = C.layers.Dense(1, activation=None)(
                self._policy_network)
            self._value_network_output_variables = C.ops.input_variable(
                shape=(1,), dtype=np.float32)
            self._value_network_loss = C.losses.squared_error(
                self._value_network, self._value_network_output_variables)
        else:
            if self._parameters.value_function_representation == 'nn':
                model = Models.feedforward_network(
                    shape_of_inputs,
                    1,  # value network outputs a scalar
                    self._parameters.value_network_hidden_layers,
                    use_placeholder_for_input=True)
            else:
                try:
                    model_definition_function = self._import_method(
                        self._parameters.value_function_representation)
                    model = model_definition_function(
                        shape_of_inputs,
                        1,  # value network outputs a scalar
                        use_placeholder_for_input=True)
                except ValueError:
                    raise ValueError(
                        'Unknown representation for value function: '
                        '"{0}"\n'.format(
                            self._parameters.value_function_representation))

            self._value_network = model['f']
            self._value_network.replace_placeholder(self._input_variables)
            self._value_network_output_variables = model['outputs']
            self._value_network_loss = model['loss']  # squared_error by default

        combined_networks = C.ops.combine(
            [self._policy_network, self._value_network])
        combined_loss = self._policy_network_loss + \
            self._parameters.regularization_weight * \
            negative_of_entropy_with_softmax(self._policy_network) + \
            self._parameters.relative_step_size * self._value_network_loss

        # The learning rate will be updated later before each minibatch
        # training.
        # TODO: allow user to specify learner through config file.
        self._trainer = C.train.trainer.Trainer(
            combined_networks,
            (combined_loss, None),
            C.learners.adam(
                combined_networks.parameters,
                C.learners.learning_parameter_schedule_per_sample(
                    self._parameters.initial_eta),
                momentum=C.learners.momentum_schedule(
                    self._parameters.momentum),
                variance_momentum=C.learners.momentum_schedule(0.999),
                minibatch_size=C.learners.IGNORE))

        print("Parameterized the agent's value function using neural network "
              '"{0}".\n'.format(
                  self._parameters.policy_representation
                  if self._parameters.shared_representation
                  else self._parameters.value_function_representation))

    def _adjust_learning_rate(self):
        if self._parameters.initial_eta != self._parameters.eta_minimum:
            eta = self._parameters.eta_minimum + max(
                0,
                (self._parameters.initial_eta -
                 self._parameters.eta_minimum) *
                (1 - float(self.step_count) /
                 self._parameters.eta_decay_step_count))
            self._trainer.parameter_learners[0].reset_learning_rate(
                C.learners.learning_parameter_schedule_per_sample(eta))

    def _choose_action(self, state):
        """
        Choose an action according to the policy.

        Args:
            state (object): observation seen by agent, which can be different
                from what is provided by the environment. The difference comes
                from preprocessing.

        Returns:
            action (int): action chosen by agent.
            debug_info (object): probability vector the action is sampled
                from.
        """
        action_probs = C.ops.softmax(
            self._evaluate_model(self._policy_network, state)).eval()
        return np.random.choice(self._num_actions, p=action_probs), \
            action_probs

    def save(self, filename):
        """Save model to file."""
        self._best_model.save(filename)

    def save_parameter_settings(self, filename):
        """Save parameter settings to file."""
        self._parameters.save(filename)

    def _evaluate_model(self, model, state):
        r"""Evaluate log of pi(\cdot|state) or v(state)."""
        return np.squeeze(model.eval({model.arguments[0]: [state]}))

    def _process_accumulated_trajectory(self, keep_last):
        """Process accumulated trajectory to generate training data.

        Args:
            keep_last (bool): last state without action and reward will be
                kept if True.
        """
        if not self._trajectory_states:
            return

        # If the trajectory hasn't terminated, _trajectory_states (and
        # sometimes _trajectory_actions) has one more item than
        # _trajectory_rewards. Equal lengths are expected when called from
        # start() or end(), where the trajectory has terminated.
        if len(self._trajectory_states) == len(self._trajectory_rewards):
            bootstrap_r = 0
        else:
            # Bootstrap from the last state.
            bootstrap_r = np.asscalar(self._evaluate_model(
                self._value_network, self._trajectory_states[-1]))
            last_state = self._trajectory_states.pop()
            if len(self._trajectory_actions) != len(self._trajectory_rewards):
                # This will only happen when the agent calls start() to begin
                # a new episode without calling end() to terminate the
                # previous episode. The last action can thus be discarded.
                self._trajectory_actions.pop()

        if len(self._trajectory_states) != len(self._trajectory_rewards) or \
                len(self._trajectory_actions) != len(self._trajectory_rewards):
            raise RuntimeError("Can't pair (state, action, reward). "
                               "state/action can only be one step ahead "
                               "of reward in the trajectory.")

        for transition in zip(
                self._trajectory_states,
                self._trajectory_actions,
                self._discount_rewards(bootstrap_r)):
            self._input_buffer.append(transition[0])
            self._value_network_output_buffer.append([transition[2]])
            # TODO: consider using cntk.ops.one_hot instead of
            # _index_to_vector.
            self._policy_network_output_buffer.append(
                self._index_to_vector(transition[1], self._num_actions))
            self._policy_network_weight_buffer.append(
                [transition[2] - self._evaluate_model(
                    self._value_network, transition[0])])

        # Clear the trajectory history.
        self._trajectory_states = []
        self._trajectory_actions = []
        self._trajectory_rewards = []
        if keep_last:
            self._trajectory_states.append(last_state)

    def _update_networks(self):
        self._adjust_learning_rate()

        # Train the policy and value networks on one minibatch.
        self._trainer.train_minibatch(
            {
                self._input_variables:
                    np.array(self._input_buffer).astype(np.float32),
                self._policy_network_output_variables:
                    np.array(self._policy_network_output_buffer).astype(
                        np.float32),
                self._policy_network_weight_variables:
                    np.array(self._policy_network_weight_buffer).astype(
                        np.float32),
                self._value_network_output_variables:
                    np.array(self._value_network_output_buffer).astype(
                        np.float32)
            })

        # Clear training data.
        self._input_buffer = []
        self._value_network_output_buffer = []
        self._policy_network_output_buffer = []
        self._policy_network_weight_buffer = []

    def _discount_rewards(self, bootstrap_r):
        discounted_rewards = [0] * len(self._trajectory_rewards)
        r = bootstrap_r
        for t in reversed(range(len(self._trajectory_rewards))):
            r = r * self._parameters.gamma + self._trajectory_rewards[t]
            discounted_rewards[t] = r
        return discounted_rewards
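

# Illustrative usage sketch (an assumption for this listing, not part of the
# module above): one way a training loop might drive ActorCritic through the
# start()/step()/end() protocol documented in the methods above. The
# environment name 'CartPole-v0', the config path 'pg.config', and the
# episode count are placeholders; any gym environment with a Discrete action
# space and a valid PolicyGradientParameters config file would do.
#
#     import gym
#
#     env = gym.make('CartPole-v0')
#     agent = ActorCritic('pg.config', env.observation_space,
#                         env.action_space)
#
#     for _ in range(1000):
#         observation = env.reset()
#         action, _ = agent.start(observation)
#         done = False
#         while not done:
#             observation, reward, done, _ = env.step(action)
#             if done:
#                 # Final reward/state of the episode.
#                 agent.end(reward, observation)
#             else:
#                 action, _ = agent.step(reward, observation)
#
#     # save() writes _best_model, so mark the current policy first.
#     agent.set_as_best_model()
#     agent.save('pg.model')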