Source code for cntk.contrib.netopt.custom_convolution_ops

# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
import numpy as np
import cntk as C
from cntk.ops.functions import UserFunction

# Custom sign function using the straight-through estimator as the gradient.
# This is implemented without a numpy intermediate representation, giving a
# much faster run time.
class SignWithEstimation(UserFunction):
    # initialize by creating a new UserFunction and assigning the forward and
    # gradient inputs and functions
    def __init__(self, arg, name='SignWithEstimation'):
        super(SignWithEstimation, self).__init__([arg], as_numpy=False, name=name)
        # create an input variable and a function object for the forward
        # propagated function
        self.action, self.actionArg = self.signFunc(arg)
        # create a binary input gradient function: two input variables and the
        # function object. gradRoot is the incoming gradient from stages down
        # the pipeline and gradArg is the argument needed for the new gradient;
        # in our case, the inputs to the forward function.
        self.grad, self.gradArg, self.gradRoot = self.gradFunc(arg)

    # define the forward propagation function, y = sign(x)
    def signFunc(self, arg):
        # create an input variable that matches the dimensions of the input
        # argument
        signIn = C.input(shape=arg.shape, dynamic_axes=arg.dynamic_axes)
        # first stage of the sign function: check whether the input is
        # greater than zero
        actionfunc = C.greater(signIn, 0)
        # second stage of the sign function: replace any 0s with -1s
        return C.element_select(actionfunc, actionfunc, -1), signIn

    # define the backward propagation function,
    # delta_out = delta_in * ((|x| <= 1) ? 1 : 0)
    def gradFunc(self, arg):
        # create an input variable corresponding to the inputs of the forward
        # prop function
        gradIn = C.input(shape=arg.shape, dynamic_axes=arg.dynamic_axes)
        # create an input variable for the gradient passed from the next stage
        gradRoot = C.input(shape=arg.shape, dynamic_axes=arg.dynamic_axes)
        # first take the absolute value of the input
        signGrad = C.abs(gradIn)
        # then compare its magnitude to 1
        signGrad = C.less_equal(signGrad, 1)
        # finish by multiplying this mask with the incoming gradient
        return C.element_times(gradRoot, signGrad), gradIn, gradRoot

    # define what should happen when a SignWithEstimation function object is
    # forwarded
    def forward(self, argument, device, outputs_to_retain):
        # perform forward on the action function object. To do this, map
        # argument to the action's input, actionArg, store the outputs in
        # action.outputs, and set as_numpy to False to avoid costly
        # conversions
        _, output_values = self.action.forward({self.actionArg: argument},
                                               self.action.outputs,
                                               device=device, as_numpy=False)
        # the first return value is what to store in state; keep the inputs
        # to forward so the straight-through estimator gradient can be
        # computed. The second selects the proper output from output_values.
        return argument.deep_clone(), output_values[self.action.output]

    # define what should happen when our function has backward performed on it
    def backward(self, state, root_gradients):
        # first extract the value passed through the state: the argument to
        # forward
        val = state
        # now perform a forward on the gradient function to compute the proper
        # outputs. Map val to gradArg and the root gradient to gradRoot to set
        # up the binary gradient inputs.
        _, output_values = self.grad.forward({self.gradArg: val, self.gradRoot: root_gradients},
                                             self.grad.outputs,
                                             device=state.device(), as_numpy=False)
        # return the proper output
        return output_values[self.grad.output]

    # define a function that returns an output_variable with the shape of
    # this function
    def infer_outputs(self):
        return [C.output_variable(self.inputs[0].shape, self.inputs[0].dtype,
                                  self.inputs[0].dynamic_axes)]
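
# Illustrative usage sketch (not part of the original module): wrap
# SignWithEstimation with C.user_function and evaluate it on a small batch.
# The names _demo_custom_sign, x, z and data are hypothetical, and the
# snippet assumes a CNTK 2.x environment where graphs containing user
# functions can be evaluated directly.
def _demo_custom_sign():
    x = C.input(shape=(3,))
    z = C.user_function(SignWithEstimation(x))
    data = np.asarray([[-0.5, 0.0, 2.0]], dtype=np.float32)
    # the forward pass maps zeros to -1, so this should print [[-1. -1.  1.]]
    print(z.eval({x: data}))
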
# A different implementation of sign with the straight-through estimator for
# backprop. It has identical outputs but uses numpy instead of CNTK
# intrinsics. This makes it much easier to write but slower for training,
# since data has to be copied between the CPU and the GPU.
class pySign(UserFunction):
    def __init__(self, arg, name='pySign'):
        super(pySign, self).__init__([arg], name=name)

    def forward(self, argument, device=None, outputs_to_retain=None):
        # y = sign(x), with sign(0) treated as -1
        sign = np.sign(argument)
        np.place(sign, sign == 0, -1)
        # keep the forward input in state for the backward pass
        return argument, sign

    def backward(self, state, root_gradients):
        input = state
        # pass the gradient through only where |input| <= 1
        grad = np.abs(input)
        grad = np.less_equal(grad, 1)
        return grad * root_gradients

    def infer_outputs(self):
        return [C.output_variable(self.inputs[0].shape, self.inputs[0].dtype,
                                  self.inputs[0].dynamic_axes)]
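
# A plain-numpy sketch of the straight-through estimator semantics shared by
# both sign implementations above (illustrative only; the names below are
# hypothetical). Forward: y = sign(x), with sign(0) mapped to -1. Backward:
# the incoming gradient passes through only where |x| <= 1.
def _demo_ste_numpy():
    x = np.asarray([-2.0, -0.5, 0.0, 0.5, 2.0], dtype=np.float32)
    y = np.sign(x)
    np.place(y, y == 0, -1)               # y == [-1, -1, -1,  1,  1]
    incoming = np.ones_like(x)            # stand-in root gradient
    mask = np.less_equal(np.abs(x), 1)    # [0, 1, 1, 1, 0]
    outgoing = incoming * mask            # gradient after clipping
    return y, outgoing
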
# Similar to CustomSign, Multibit binarizes an input using straight-through
# estimator gradients. However, Multibit also supports a bit_map argument that
# specifies how many bits to binarize to. bit_map is a tensor with the shape
# of the input holding a bit width for each input value. Although the bit_map
# will be uniform in most cases, varying bit widths are supported. The kernel
# variant of Multibit computes a scalar for each bit of each kernel; it should
# only be used on weight values. For activations, use the regular Multibit
# version.
class MultibitKernel(UserFunction):
    # initialize by creating a new UserFunction and assigning the forward and
    # gradient inputs and functions
    def __init__(self, arg1, arg2, name='MultibitKernel'):
        super(MultibitKernel, self).__init__([arg1], as_numpy=False, name=name)
        # save the input bit map for later
        self.bit_map = arg2
        # create an input variable and a function object for the forward
        # propagated function
        self.action, self.actionArg = self.multiFunc(arg1)
        # create a binary input gradient function: two input variables and the
        # function object. gradRoot is the incoming gradient from stages down
        # the pipeline and gradArg is the argument needed for the new gradient;
        # in our case, the inputs to the forward function.
        self.grad, self.gradArg, self.gradRoot = self.gradFunc(arg1)

    # define the forward propagation function, y = multibit(x)
    def multiFunc(self, arg1):
        # load or create the inputs we need
        multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
        bit_map = C.constant(self.bit_map)
        max_bits = self.bit_map.max()
        shape = multiIn.shape
        reformed = C.reshape(multiIn, (-1,))
        # carry_over represents the remaining value that still needs to be
        # binarized. For a single bit this is just the input; for more bits it
        # is the difference between the previous bits' approximation and the
        # true value.
        carry_over = multiIn
        approx = C.element_times(multiIn, 0)
        # iterate through the maximum number of bits specified by the bit map,
        # computing each level of binarization
        for i in range(max_bits):
            # determine which values of the input should be binarized to i
            # bits or more
            hot_vals = C.greater(bit_map, i)
            # select only the values which we need to binarize
            valid_vals = C.element_select(hot_vals, carry_over, 0)
            # compute the mean on a per-kernel basis; reshaping allows sum
            # reduction along only axis 0 (the kernels)
            mean = C.element_divide(
                C.reduce_sum(C.reshape(C.abs(valid_vals), (valid_vals.shape[0], -1)), axis=1),
                C.reduce_sum(C.reshape(hot_vals, (hot_vals.shape[0], -1)), axis=1))
            # reshape the mean to match the dimensionality of the input
            mean = C.reshape(mean, (mean.shape[0], mean.shape[1], 1, 1))
            # binarize the carry over
            bits = C.greater(carry_over, 0)
            bits = C.element_select(bits, bits, -1)
            bits = C.element_select(hot_vals, bits, 0)
            # add the equivalent binary representation to the approximation
            approx = C.plus(approx, C.element_times(mean, bits))
            # compute the new carry over
            carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

        return approx, multiIn

    # define the backward propagation function,
    # delta_out = delta_in * ((|x| <= bit_map) ? 1 : 0)
    def gradFunc(self, arg):
        # create an input variable corresponding to the inputs of the forward
        # prop function
        gradIn = C.input(shape=arg.shape, dynamic_axes=arg.dynamic_axes)
        # create an input variable for the gradient passed from the next stage
        gradRoot = C.input(shape=arg.shape, dynamic_axes=arg.dynamic_axes)
        signGrad = C.abs(gradIn)
        # the clipping bound is a function of the bit map, since higher bit
        # widths can represent larger numbers
        bit_map = C.constant(self.bit_map)
        signGrad = C.less_equal(signGrad, bit_map)
        outGrad = C.element_times(gradRoot, signGrad)
        return outGrad, gradIn, gradRoot

    # define what should happen when a MultibitKernel function object is
    # forwarded
    def forward(self, argument, device, outputs_to_retain):
        # perform forward on the action function object. To do this, map
        # argument to the action's input, actionArg, store the outputs in
        # action.outputs, and set as_numpy to False to avoid costly
        # conversions
        _, output_values = self.action.forward({self.actionArg: argument},
                                               self.action.outputs,
                                               device=device, as_numpy=False)
        # the first return value is what to store in state; keep the inputs
        # to forward so the straight-through estimator gradient can be
        # computed. The second selects the proper output from output_values.
        return argument.deep_clone(), output_values[self.action.output]

    # define what should happen when our function has backward performed on it
    def backward(self, state, root_gradients):
        # first extract the value passed through the state: the argument to
        # forward
        val = state
        # now perform a forward on the gradient function to compute the proper
        # outputs. Map val to gradArg and the root gradient to gradRoot to set
        # up the binary gradient inputs.
        _, output_values = self.grad.forward({self.gradArg: val, self.gradRoot: root_gradients},
                                             self.grad.outputs,
                                             device=state.device(), as_numpy=False)
        # return the proper output
        return output_values[self.grad.output]

    # define a function that returns an output_variable with the shape of
    # this function
    def infer_outputs(self):
        return [C.output_variable(self.inputs[0].shape, self.inputs[0].dtype,
                                  self.inputs[0].dynamic_axes)]
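
# Illustrative usage sketch (hypothetical names, not part of the original
# module): apply per-kernel multibit binarization to a 4-D convolution
# weight. MultibitKernel expects weights shaped (kernels, channels, h, w),
# since multiFunc reduces over everything but axis 0 when computing the
# per-kernel means.
def _demo_multibit_kernel():
    W = C.parameter((8, 3, 3, 3), init=C.glorot_uniform())
    bit_map = np.full((8, 3, 3, 3), 2, dtype=np.int32)  # uniform 2-bit widths
    W_q = C.user_function(MultibitKernel(W, bit_map))
    return W_q
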
# Similar to MultibitKernel, but computes a single input-wide scalar per bit.
class Multibit(UserFunction):
    # initialize by creating a new UserFunction and saving the bit map; the
    # forward and gradient functions are built later, in infer_outputs
    def __init__(self, arg1, arg2, name='Multibit'):
        super(Multibit, self).__init__([arg1], as_numpy=False, name=name)
        self.bit_map = arg2

    # define the forward propagation function, y = multibit(x)
    def multiFunc(self, arg1):
        multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
        bit_map = C.constant(self.bit_map)
        max_bits = self.bit_map.max()
        shape = multiIn.shape
        reformed = C.reshape(multiIn, (-1,))
        # carry_over is the residual that still needs to be binarized
        carry_over = multiIn
        approx = C.element_times(multiIn, 0)
        for i in range(max_bits):
            # select the values that should be binarized to i bits or more
            hot_vals = C.greater(bit_map, i)
            valid_vals = C.element_select(hot_vals, carry_over, 0)
            # compute a single mean over the entire input
            mean = C.element_divide(C.reduce_sum(C.abs(valid_vals)), C.reduce_sum(hot_vals))
            # binarize the carry over
            bits = C.greater(carry_over, 0)
            bits = C.element_select(bits, bits, -1)
            bits = C.element_select(hot_vals, bits, 0)
            # update the approximation and the residual
            approx = C.plus(approx, C.element_times(mean, bits))
            carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

        return approx, multiIn

    # define the backward propagation function,
    # delta_out = delta_in * ((|x| <= bit_map) ? 1 : 0)
    def gradFunc(self, arg):
        # create an input variable corresponding to the inputs of the forward
        # prop function
        gradIn = C.input(shape=arg.shape, dynamic_axes=arg.dynamic_axes)
        # create an input variable for the gradient passed from the next stage
        gradRoot = C.input(shape=arg.shape, dynamic_axes=arg.dynamic_axes)
        signGrad = C.abs(gradIn)
        # the clipping bound is a function of the bit map, since higher bit
        # widths can represent larger numbers
        bit_map = C.constant(self.bit_map)
        signGrad = C.less_equal(signGrad, bit_map)
        outGrad = C.element_times(gradRoot, signGrad)
        return outGrad, gradIn, gradRoot

    # define what should happen when a Multibit function object is forwarded
    def forward(self, argument, device, outputs_to_retain):
        # perform forward on the action function object. To do this, map
        # argument to the action's input, actionArg, store the outputs in
        # action.outputs, and set as_numpy to False to avoid costly
        # conversions
        _, output_values = self.action.forward({self.actionArg: argument},
                                               self.action.outputs,
                                               device=device, as_numpy=False)
        # the first return value is what to store in state; keep the inputs
        # to forward so the straight-through estimator gradient can be
        # computed. The second selects the proper output from output_values.
        return argument.deep_clone(), output_values[self.action.output]

    # define what should happen when our function has backward performed on it
    def backward(self, state, root_gradients):
        # first extract the value passed through the state: the argument to
        # forward
        val = state
        # now perform a forward on the gradient function to compute the proper
        # outputs. Map val to gradArg and the root gradient to gradRoot to set
        # up the binary gradient inputs.
        _, output_values = self.grad.forward({self.gradArg: val, self.gradRoot: root_gradients},
                                             self.grad.outputs,
                                             device=state.device(), as_numpy=False)
        # return the proper output
        return output_values[self.grad.output]

    # define a function that returns an output_variable with the shape of
    # this function
    def infer_outputs(self):
        output_vars = [C.output_variable(self.inputs[0].shape, self.inputs[0].dtype,
                                         self.inputs[0].dynamic_axes)]
        # build the forward and gradient functions here, once the inputs are
        # known
        self.action, self.actionArg = self.multiFunc(self.inputs[0])
        self.grad, self.gradArg, self.gradRoot = self.gradFunc(self.inputs[0])
        return output_vars

    def serialize(self):
        return {'bit_map': np.asarray(self.bit_map, dtype=np.float32)}

    @staticmethod
    def deserialize(inputs, name, state):
        return Multibit(inputs[0], np.asarray(state['bit_map'], dtype=np.int32), name)

    def clone(self, cloned_inputs):
        cloned_inputs[0].__class__ = C.Variable
        return Multibit(cloned_inputs[0], self.bit_map, self.name)
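
# A plain-numpy sketch of the greedy residual binarization that multiFunc
# expresses as a CNTK graph (illustrative only; uniform bit width,
# hypothetical names). Each pass binarizes the remaining residual and scales
# it by the residual's mean magnitude, so approx approaches x as bits grows.
def _demo_multibit_numpy(x, bits):
    carry_over = np.asarray(x, dtype=np.float32)
    approx = np.zeros_like(carry_over)
    for _ in range(bits):
        mean = np.mean(np.abs(carry_over))       # one input-wide scalar per bit
        b = np.where(carry_over > 0, 1.0, -1.0)  # sign, with 0 mapped to -1
        approx = approx + mean * b
        carry_over = carry_over - mean * b
    return approx
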
# These are the public faces of the custom functions; they simply instantiate
# a custom function by calling user_function.
def CustomSign(input):
    return C.user_function(SignWithEstimation(input))

def CustomPySign(input):
    return C.user_function(pySign(input))

def CustomMultibit(input, bit_map, mean_bits=None):
    # optionally draw a per-element bit width at random around mean_bits
    if (mean_bits):
        bit_map = np.asarray(np.maximum(np.round(np.random.normal(mean_bits, 1, input.shape)), 1),
                             dtype=np.int32)
        print("Mean Bits: ", np.mean(bit_map))
    else:
        # an integer bit_map is broadcast to every element of the input
        if (type(bit_map) == int):
            length = C.reshape(input, (-1))
            bit_map = [bit_map] * length.shape[0]
            bit_map = np.asarray(bit_map)
            bit_map = bit_map.reshape(input.shape)
        else:
            bit_map = np.asarray(bit_map)
        assert (bit_map.shape == input.shape)
    return C.user_function(Multibit(input, bit_map))

def CustomMultibitKernel(input, bit_map, mean_bits=None):
    # optionally draw a per-element bit width at random around mean_bits
    if (mean_bits):
        bit_map = np.asarray(np.maximum(np.round(np.random.normal(mean_bits, 1, input.shape)), 1),
                             dtype=np.int32)
        print("Mean Bits: ", np.mean(bit_map))
    else:
        # an integer bit_map is broadcast to every element of the input
        if (type(bit_map) == int):
            length = C.reshape(input, (-1))
            bit_map = [bit_map] * length.shape[0]
            bit_map = np.asarray(bit_map)
            bit_map = bit_map.reshape(input.shape)
        else:
            bit_map = np.asarray(bit_map)
        assert (bit_map.shape == input.shape)
    return C.user_function(MultibitKernel(input, bit_map))
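
# End-to-end usage sketch (illustrative, hypothetical names): an int bit_map
# is broadcast to every element of the input, so this quantizes each value of
# a (4,)-shaped activation to a uniform 2-bit representation.
def _demo_custom_multibit():
    x = C.input(shape=(4,))
    q = CustomMultibit(x, 2)
    data = np.asarray([[-1.5, -0.25, 0.25, 1.5]], dtype=np.float32)
    print(q.eval({x: data}))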