# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Activations layers."""
from lingvo import compat as tf
from lingvo.core import base_layer
from lingvo.core import py_utils
import numpy as np
# Supported activation functions.
_ACTIVATIONS = {
    'RELU': tf.nn.relu,
    'RELU6': tf.nn.relu6,
    'LEAKY_RELU': tf.nn.leaky_relu,
    'SIGMOID': tf.sigmoid,
    'TANH': tf.tanh,
    'GELU': tf.nn.gelu,
    'GELU_APPROXIMATE': lambda x: tf.nn.gelu(x, approximate=True),
    'GELU_RAW': lambda x: 0.5 * x * (  # pylint: disable=g-long-lambda
        1 + tf.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))),
    'SWISH': tf.nn.swish,
    'SOFTPLUS': tf.nn.softplus,
    # Squared ReLU from the Primer paper: https://arxiv.org/abs/2109.08668
    'SQUARED_RELU': lambda x: tf.math.square(tf.nn.relu(x)),
    'SILU': tf.nn.silu,
    # GLU Variants: https://arxiv.org/abs/2002.05202
    'GLU': lambda x: GLUVariants(x, 'SIGMOID'),
    'BILINEAR_GLU': lambda x: GLUVariants(x, 'NONE'),
    'RELU_GLU': lambda x: GLUVariants(x, 'RELU'),
    'GELU_GLU': lambda x: GLUVariants(x, 'GELU'),
    'SWISH_GLU': lambda x: GLUVariants(x, 'SWISH'),
    'NONE': tf.identity,
}
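
# The table above maps activation names to callables; GetFn() below performs
# the lookup. Example (illustrative):
#   act_fn = GetFn('GELU_APPROXIMATE')
#   y = act_fn(tf.constant([-1.0, 0.0, 1.0]))
# Unknown names raise a KeyError, so callers can check IsSupported() first.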

_ACTIVATIONS_FLOPS = {
    'NONE': 0,
    'RELU': 1,
    'RELU6': 1,
    # ReLU(x) - 0.2 * ReLU(-x)
    # neg, relu, mul, sub, relu
    'LEAKY_RELU': 5,
    # 1 / (1 + exp(-x))
    'SIGMOID': 4,  # neg, exp, add, div
    # (exp(2*x) - 1) / (exp(2*x) + 1)
    'TANH': 7,  # mul, exp, sub, mul, exp, add, div
    # GELU is hard to count exactly; assume the tanh approximation
    # .5 * x * (1 + tanh(x * 0.7978845608 * (1 + 0.044715 * x * x))).
    'GELU': 15,  # mul, mul, add, tanh, mul, mul, add, mul, mul
    'GELU_RAW': 15,  # same as GELU
    # Counted with the cheaper sigmoid approximation, x * sigmoid(1.702 * x).
    'GELU_APPROXIMATE': 6,  # mul, sigmoid, mul
    # x * sigmoid(x)
    'SWISH': 5,  # sigmoid, mul
    'SILU': 5,  # same as SWISH: sigmoid, mul
    # ln(1 + exp(x))
    'SOFTPLUS': 3,  # exp, add, ln
    'SQUARED_RELU': 2,  # relu, mul
    'GLU': 5,  # SIGMOID, mul
    'BILINEAR_GLU': 1,  # NONE, mul
    'RELU_GLU': 2,  # RELU, mul
    'GELU_GLU': 16,  # GELU, mul
    'SWISH_GLU': 6,  # SWISH, mul
}
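
# Example (illustrative): the counts above are per element, so a rough cost
# estimate for applying an activation is the table value times the number of
# elements, e.g.
#   cost = GetFlops('SWISH') * np.prod(inputs_shape)
# where `inputs_shape` is a hypothetical static shape such as (batch, dim).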


def GetFn(activation_name):
  """Returns the function corresponding to the activation name."""
  return _ACTIVATIONS[activation_name]


def GetFlops(activation_name):
  """Returns the per-element FLOPS count for the activation name."""
  return _ACTIVATIONS_FLOPS[activation_name]


def IsSupported(activation_name):
  """Checks if the activation is supported."""
  return activation_name in _ACTIVATIONS


def DimMultiplier(activation_name):
  """Returns the input dimension multiplier for the activation (2 for GLU)."""
  assert IsSupported(activation_name)
  if activation_name.endswith('GLU'):
    return 2
  return 1


def GLUVariants(x, activation_name):
  """Splits x in half on the last dim and gates one half with the other."""
  x1, x2 = tf.split(x, 2, axis=-1)
  return x1 * _ACTIVATIONS[activation_name](x2)
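
# Example (illustrative): GLU variants halve the last dimension, which is why
# DimMultiplier() returns 2 for the *_GLU names. For a [batch, 2 * d] input,
#   y = GLUVariants(x, 'SIGMOID')
# returns a [batch, d] tensor equal to x1 * tf.sigmoid(x2), where x1 and x2
# are the two halves of x along the last axis.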


class ActivationLayer(base_layer.BaseLayer):
  """Activation layer."""

  @classmethod
  def Params(cls):
    p = super().Params()
    p.Define('activation', 'RELU', 'Activation function to use.')
    return p

  def FProp(self, theta, inputs, paddings=None):
    """Applies the activation function.

    Args:
      theta: A `.NestedMap` object containing weights' values of this layer and
        its children layers.
      inputs: The input tensor.
      paddings: The optional paddings tensor.

    Returns:
      If paddings is not None, a tuple of (output, paddings); otherwise just
      the output, with the same shape and dtype as inputs.
    """
    p = self.params
    if p.activation == 'NONE':
      ret = inputs
    else:
      with tf.name_scope(p.name):
        ret = GetFn(p.activation)(inputs)
    if paddings is None:
      return ret
    else:
      return ret, paddings
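
# Illustrative usage sketch, assuming the usual lingvo Params()/Instantiate()
# pattern (the names 'act' and `inputs` here are just examples):
#   p = ActivationLayer.Params().Set(name='act', activation='SWISH')
#   layer = p.Instantiate()
#   out = layer.FProp(layer.theta, inputs)
#   out, paddings = layer.FProp(layer.theta, inputs, paddings)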