# Source code for lingvo.core.egdd

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Exponentiated Gradient Delta-Delta optimizer."""

# pylint: disable=g-direct-tensorflow-import
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import optimizer
# pylint: enable=g-direct-tensorflow-import


class EGDD(optimizer.Optimizer):
  """A version of GD Momentum with adaptive gain and learning rate.

  Exponentiated Gradient Delta-delta optimizer starts with a local gain of 1.0
  for every weight and a lr_scale of 1.0 for all weights. The EGDD update rule
  applies:

    momentum <- mu * momentum + learning_rate * gain * grad
    var <- var - lr_scale * momentum

  The gain as well as the lr_scale are updated using the unnormalized
  exponentiated gradient algorithm [KW97].

  Reference: TBA

  [KW97] Kivinen, J., & Warmuth, M. K. Exponentiated gradient versus gradient
  descent for linear predictors. Information and Computation, 1997.
  """

  def __init__(self,
               learning_rate,
               momentum,
               beta=0.9,
               gain_learning_rate=0.01,
               scale_learning_rate=0.001,
               initial_gain=1.0,
               min_gain=1e-2,
               max_gain=1e2,
               initial_scale=1.0,
               min_scale=1e-1,
               max_scale=1e1,
               use_directions=True,
               use_signs=True,
               name="EGDD"):
    """Construct a new EG-DD optimizer.

    Args:
      learning_rate: A `Tensor` or a floating point value. The learning rate.
      momentum: A `Tensor` or a floating point value.
      beta: `float` decay rate of the gradient EMA.
      gain_learning_rate: `float` gain learning rate.
      scale_learning_rate: `float` scale learning rate.
      initial_gain: `float` initial gain.
      min_gain: `float` minimum gain.
      max_gain: `float` maximum gain.
      initial_scale: `float` initial scale.
      min_scale: `float` minimum learning rate scale.
      max_scale: `float` maximum learning rate scale.
      use_directions: `bool` whether to use directions only for scale updates.
      use_signs: `bool` whether to use the signs for updating gains.
      name: Optional name prefix for the operations created when applying
        gradients.

    Raises:
      ValueError: If the `initial_accumulator_value` is invalid.
    """
    # use_locking=False preserves the historical behavior of this optimizer.
    super().__init__(False, name)
    self._learning_rate = learning_rate
    self._momentum = momentum
    self._beta = beta
    self._gain_learning_rate = gain_learning_rate
    self._scale_learning_rate = scale_learning_rate
    self._initial_gain = initial_gain
    self._min_gain = min_gain
    self._max_gain = max_gain
    self._initial_scale = initial_scale
    self._min_scale = min_scale
    self._max_scale = max_scale
    self._use_directions = use_directions
    self._use_signs = use_signs

  def _create_slots(self, var_list):
    """Creates the per-variable state: momentum, gbar, gain, lr_scale, counter."""
    for v in var_list:
      self._zeros_slot(v, "momentum", self._name)
      self._zeros_slot(v, "gbar", self._name)
      g_tensor = ops.convert_to_tensor(v)
      # One gain per weight, initialized to initial_gain.
      gain_init = self._initial_gain * array_ops.ones_like(g_tensor)
      # lr_scale and counter are scalar (shape (1,)) slots shared per variable.
      _ = self._get_or_make_slot(v, self._initial_scale * array_ops.ones((1)),
                                 "lr_scale", self._name)
      _ = self._get_or_make_slot(v, gain_init, "gain", self._name)
      _ = self._get_or_make_slot(v, array_ops.zeros((1)), "counter",
                                 self._name)

  def _prepare(self):
    """Converts the (possibly callable) hyperparameters into tensors."""
    learning_rate = self._call_if_callable(self._learning_rate)
    self._learning_rate_tensor = ops.convert_to_tensor(
        learning_rate, name="learning_rate")
    momentum = self._call_if_callable(self._momentum)
    self._momentum_tensor = ops.convert_to_tensor(momentum, name="momentum")

  def _apply_dense(self, grad, var):
    """Applies a single EG-DD step to `var` and its slot variables."""
    lr_scale = self.get_slot(var, "lr_scale")
    momentum = self.get_slot(var, "momentum")
    gbar = self.get_slot(var, "gbar")
    gain = self.get_slot(var, "gain")
    counter = self.get_slot(var, "counter")
    counter_updated = state_ops.assign(counter, counter + 1)

    # lr_scale update uses normalized grad and momentum to be independent of
    # the parameter dimension.
    normalized_grad = grad / (linalg_ops.norm(grad) + 1e-10)
    normalized_momentum = momentum / (linalg_ops.norm(momentum) + 1e-10)

    # Apply EG updates on lr_scale:
    #   grad_lr_scale = -inner_product(current_grad, old_momentum)
    #   lr_scale <- lr_scale * exp(-scale_learning_rate * grad_lr_scale)
    lr_scale_unnormalized_updated = clip_ops.clip_by_value(
        lr_scale * math_ops.exp(
            self._scale_learning_rate * math_ops.reduce_sum(grad * momentum)),
        self._min_scale, self._max_scale)
    lr_scale_normalized_updated = clip_ops.clip_by_value(
        lr_scale * math_ops.exp(
            self._scale_learning_rate *
            math_ops.reduce_sum(normalized_grad * normalized_momentum)),
        self._min_scale, self._max_scale)
    lr_scale_updated = state_ops.assign(
        lr_scale,
        array_ops.where(self._use_directions, lr_scale_normalized_updated,
                        lr_scale_unnormalized_updated))

    # Remove the bias of zero initialization in gbar (EMA debiasing; the
    # max(counter - 1, 1) keeps the very first step well-defined).
    corrected_gbar = gbar / (
        1.0 - self._beta**math_ops.maximum(counter_updated - 1, 1))

    # Apply EG updates on gain:
    #   grad_gain = -current_grad * old_gbar
    #   gain <- gain * exp(-gain_learning_rate * grad_gain)
    gain_unnormalized_updated = clip_ops.clip_by_value(
        gain * math_ops.exp(self._gain_learning_rate * grad * corrected_gbar),
        self._min_gain, self._max_gain)
    # Normalized update uses sign(grad) * sign(gbar) as a proxy for grad_gain.
    gain_normalized_updated = clip_ops.clip_by_value(
        gain * math_ops.exp(self._gain_learning_rate * math_ops.sign(grad) *
                            math_ops.sign(gbar)), self._min_gain,
        self._max_gain)
    gain_updated = state_ops.assign(
        gain,
        array_ops.where(self._use_signs, gain_normalized_updated,
                        gain_unnormalized_updated))

    scaled_g = self._learning_rate_tensor * gain_updated * grad
    # Force the lr_scale/gain updates to happen before the state updates that
    # consume the *old* momentum/gbar values.
    with ops.control_dependencies([lr_scale_updated, scaled_g]):
      momentum_updated = state_ops.assign(
          momentum, self._momentum_tensor * momentum + scaled_g)
      gbar_updated = state_ops.assign(
          gbar, self._beta * gbar + (1.0 - self._beta) * grad)
    with ops.control_dependencies([gbar_updated]):
      return state_ops.assign_sub(var, lr_scale_updated * momentum_updated)

  def _resource_apply_dense(self, grad, var):
    # Resource variables share the dense update path.
    return self._apply_dense(grad, var)

  # Sparse gradients are not handled currently and is part of future work.
  def _resource_apply_sparse(self, grad_values, var, grad_indices):
    return control_flow_ops.no_op()

  def _apply_sparse(self, grad, var):
    return control_flow_ops.no_op()