Source code for optimizer

#!/usr/bin/env python
# -*- coding: utf-8 -*-


from __future__ import division
from __future__ import print_function

import numpy as np

__author__ = ['Mattia Ceccarelli', 'Nico Curti']
__email__ = ['mattia.ceccarelli3@studio.unibo.it', 'nico.curti2@unibo.it']


class Optimizer (object):

  '''
  Abstract base class for the optimizers

  Parameters
  ----------
    lr : float (default=1e-3)
      Learning rate value

    decay : float (default=0.)
      Learning rate decay

    lr_min : float (default=0.)
      Minimum of learning rate domain

    lr_max : float (default=np.inf)
      Maximum of learning rate domain

    *args : list
      Class specialization variables.

    **kwargs : dict
      Class specialization variables.
  '''

  def __init__ (self, lr=1e-3, decay=0., lr_min=0., lr_max=np.inf, *args, **kwargs):

    self.lr = lr
    self.decay = decay
    self.lr_min = lr_min
    self.lr_max = lr_max

    self.iterations = 1
  def update (self, params, gradients):
    '''
    Update the optimizer parameters

    Parameters
    ----------
      params : list
        List of parameters to update

      gradients : list
        List of corresponding gradients

    Returns
    -------
      self
    '''
    self.lr *= 1. / (self.decay * self.iterations + 1.)
    self.lr = np.clip(self.lr, self.lr_min, self.lr_max)
    self.iterations += 1

    return self
  def __repr__ (self):
    '''
    Representation
    '''
    class_name = self.__class__.__qualname__

    try:
      params = super(type(self), self).__init__.__code__.co_varnames
    except AttributeError:
      params = self.__init__.__code__.co_varnames

    params = set(params) - {'self', 'args', 'kwargs'}

    args = ', '.join(['{0}={1}'.format(k, str(getattr(self, k)))
                      if not isinstance(getattr(self, k), str)
                      else '{0}="{1}"'.format(k, str(getattr(self, k)))
                      for k in params])

    return '{0}({1})'.format(class_name, args)

  def __str__ (self):
    '''
    Printer
    '''
    return self.__class__.__name__
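
# Usage sketch (illustrative, not part of the original module): the base class
# applies a multiplicative decay lr <- lr / (decay * iterations + 1) and clips
# the result to [lr_min, lr_max] on every call to update; the values below are
# arbitrary toy settings.
#
#   sched = Optimizer(lr=1., decay=1., lr_min=1e-2)
#   for _ in range(3):
#     sched.update(params=[], gradients=[])
#     print(sched.lr)  # 0.5, then ~0.167, then ~0.0417
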
class SGD (Optimizer):

  '''
  Stochastic Gradient Descent specialization

  Update the parameters according to the rule

  .. code-block:: python

    parameter -= learning_rate * gradient

  Parameters
  ----------
    *args : list
      Class specialization variables.

    **kwargs : dict
      Class specialization variables.
  '''

  def __init__ (self, *args, **kwargs):
    super(SGD, self).__init__(*args, **kwargs)
  def update (self, params, gradients):
    '''
    Update the given parameters according to the class optimization algorithm

    Parameters
    ----------
      params : list
        List of parameters to update

      gradients : list
        List of corresponding gradients

    Returns
    -------
      params : list
        The updated parameters
    '''
    for p, g in zip(params, gradients):
      p -= self.lr * g  # np.clip(g, -1., 1.)

    super(SGD, self).update(params, gradients)

    return params
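
# Usage sketch (illustrative, not part of the original module): a single SGD step
# on a toy weight array; parameters are expected to be numpy float arrays, since
# they are updated in place. The numbers are arbitrary.
#
#   import numpy as np
#   w = np.array([1., -2.])
#   opt = SGD(lr=.1)
#   opt.update(params=[w], gradients=[np.array([.5, -.5])])
#   # w is now approximately [0.95, -1.95], i.e. w - lr * gradient
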
class Momentum (Optimizer):

  '''
  Stochastic Gradient Descent with Momentum specialization

  Update the parameters according to the rule

  .. code-block:: python

    v = momentum * v - learning_rate * gradient
    parameter += v

  Parameters
  ----------
    momentum : float (default=0.9)
      Momentum value

    *args : list
      Class specialization variables.

    **kwargs : dict
      Class specialization variables.
  '''

  def __init__ (self, momentum=.9, *args, **kwargs):
    super(Momentum, self).__init__(*args, **kwargs)

    self.momentum = momentum
    self.velocity = None
  def update (self, params, gradients):
    '''
    Update the given parameters according to the class optimization algorithm

    Parameters
    ----------
      params : list
        List of parameters to update

      gradients : list
        List of corresponding gradients

    Returns
    -------
      params : list
        The updated parameters
    '''
    if self.velocity is None:
      self.velocity = [np.zeros(shape=p.shape, dtype=float) for p in params]

    for i, (v, p, g) in enumerate(zip(self.velocity, params, gradients)):
      v = self.momentum * v - self.lr * g  # np.clip(g, -1., 1.)
      p += v
      self.velocity[i] = v

    super(Momentum, self).update(params, gradients)

    return params
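
# Usage sketch (illustrative, not part of the original module): with a constant
# gradient the velocity accumulates, so successive steps grow towards
# lr / (1 - momentum). Toy values below.
#
#   import numpy as np
#   w = np.zeros(shape=(1,), dtype=float)
#   opt = Momentum(lr=.1, momentum=.9)
#   for _ in range(3):
#     opt.update(params=[w], gradients=[np.ones_like(w)])
#   # the three steps are -0.1, -0.19 and -0.271:
#   # each one is momentum * (previous step) - lr * gradient
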
class NesterovMomentum (Optimizer):

  '''
  Stochastic Gradient Descent with Nesterov Momentum specialization

  Update the parameters according to the rule

  .. code-block:: python

    v = momentum * v - learning_rate * gradient
    parameter += momentum * v - learning_rate * gradient

  Parameters
  ----------
    momentum : float (default=0.9)
      Momentum value

    *args : list
      Class specialization variables.

    **kwargs : dict
      Class specialization variables.
  '''

  def __init__ (self, momentum=.9, *args, **kwargs):
    super(NesterovMomentum, self).__init__(*args, **kwargs)

    self.momentum = momentum
    self.velocity = None
  def update (self, params, gradients):
    '''
    Update the given parameters according to the class optimization algorithm

    Parameters
    ----------
      params : list
        List of parameters to update

      gradients : list
        List of corresponding gradients

    Returns
    -------
      params : list
        The updated parameters
    '''
    if self.velocity is None:
      self.velocity = [np.zeros(shape=p.shape, dtype=float) for p in params]

    for i, (v, p, g) in enumerate(zip(self.velocity, params, gradients)):
      v = self.momentum * v - self.lr * g  # np.clip(g, -1., 1.)
      p += self.momentum * v - self.lr * g  # np.clip(g, -1., 1.)
      self.velocity[i] = v

    super(NesterovMomentum, self).update(params, gradients)

    return params
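
# Usage sketch (illustrative, not part of the original module): the parameter is
# moved by momentum * v - lr * gradient, a look-ahead along the freshly updated
# velocity, so with lr=.1, momentum=.9 and unit gradient the very first step is
# -0.19 instead of plain Momentum's -0.1.
#
#   import numpy as np
#   w = np.zeros(shape=(1,), dtype=float)
#   opt = NesterovMomentum(lr=.1, momentum=.9)
#   opt.update(params=[w], gradients=[np.ones_like(w)])
#   # w is now approximately [-0.19]
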
class Adagrad (Optimizer):

  '''
  Adagrad optimizer specialization

  Update the parameters according to the rule

  .. code-block:: python

    c += gradient * gradient
    parameter -= learning_rate * gradient / (sqrt(c) + epsilon)

  Parameters
  ----------
    epsilon : float (default=1e-6)
      Precision parameter to avoid division by zero

    *args : list
      Class specialization variables.

    **kwargs : dict
      Class specialization variables.
  '''

  def __init__ (self, epsilon=1e-6, *args, **kwargs):
    super(Adagrad, self).__init__(*args, **kwargs)

    self.epsilon = epsilon
    self.cache = None
  def update (self, params, gradients):
    '''
    Update the given parameters according to the class optimization algorithm

    Parameters
    ----------
      params : list
        List of parameters to update

      gradients : list
        List of corresponding gradients

    Returns
    -------
      params : list
        The updated parameters
    '''
    if self.cache is None:
      self.cache = [np.zeros(shape=p.shape, dtype=float) for p in params]

    for i, (c, p, g) in enumerate(zip(self.cache, params, gradients)):
      c += g * g
      p -= self.lr * g / (np.sqrt(c) + self.epsilon)
      self.cache[i] = c

    super(Adagrad, self).update(params, gradients)

    return params
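
# Usage sketch (illustrative, not part of the original module): the accumulated
# squared gradients rescale every component, so on the first step each component
# moves by roughly lr * sign(gradient) regardless of its raw magnitude, and
# components that keep receiving large gradients get progressively smaller steps.
#
#   import numpy as np
#   w = np.zeros(shape=(2,), dtype=float)
#   opt = Adagrad(lr=.1)
#   opt.update(params=[w], gradients=[np.array([10., .1])])
#   # w is now approximately [-0.1, -0.1]
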
class RMSprop (Optimizer):

  '''
  RMSprop optimization algorithm

  Update the parameters according to the rule

  .. code-block:: python

    c = rho * c + (1. - rho) * gradient * gradient
    parameter -= learning_rate * gradient / (sqrt(c) + epsilon)

  Parameters
  ----------
    rho : float (default=0.9)
      Decay factor

    epsilon : float (default=1e-6)
      Precision parameter to avoid division by zero

    *args : list
      Class specialization variables.

    **kwargs : dict
      Class specialization variables.
  '''

  def __init__ (self, rho=.9, epsilon=1e-6, *args, **kwargs):
    super(RMSprop, self).__init__(*args, **kwargs)

    self.rho = rho
    self.epsilon = epsilon
    self.cache = None
  def update (self, params, gradients):
    '''
    Update the given parameters according to the class optimization algorithm

    Parameters
    ----------
      params : list
        List of parameters to update

      gradients : list
        List of corresponding gradients

    Returns
    -------
      params : list
        The updated parameters
    '''
    if self.cache is None:
      self.cache = [np.zeros(shape=p.shape, dtype=float) for p in params]

    for i, (c, p, g) in enumerate(zip(self.cache, params, gradients)):
      c = self.rho * c + (1 - self.rho) * g * g
      p -= (self.lr * g / (np.sqrt(c) + self.epsilon))
      self.cache[i] = c

    super(RMSprop, self).update(params, gradients)

    return params
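
# Usage sketch (illustrative, not part of the original module): unlike Adagrad's
# ever-growing sum, the cache is an exponential moving average, so the
# per-component scale follows recent gradient magnitudes and can shrink again
# once the gradients do.
#
#   import numpy as np
#   w = np.zeros(shape=(1,), dtype=float)
#   opt = RMSprop(lr=1e-3, rho=.9)
#   for g in (10., 10., .1):
#     opt.update(params=[w], gradients=[np.full_like(w, g)])
#   # opt.cache[0] grows to 19 after the two large gradients,
#   # then decays towards ~17.1 after the small one
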
class Adadelta (Optimizer):

  '''
  AdaDelta optimization algorithm

  Update the parameters according to the rule

  .. code-block:: python

    c = rho * c + (1. - rho) * gradient * gradient
    update = gradient * sqrt(d + epsilon) / sqrt(c + epsilon)
    parameter -= learning_rate * update
    d = rho * d + (1. - rho) * update * update

  Parameters
  ----------
    rho : float (default=0.9)
      Decay factor

    epsilon : float (default=1e-6)
      Precision parameter to avoid division by zero

    *args : list
      Class specialization variables.

    **kwargs : dict
      Class specialization variables.
  '''

  def __init__ (self, rho=0.9, epsilon=1e-6, *args, **kwargs):
    super(Adadelta, self).__init__(*args, **kwargs)

    self.rho = rho
    self.epsilon = epsilon
    self.cache = None
    self.delta = None
  def update (self, params, gradients):
    '''
    Update the given parameters according to the class optimization algorithm

    Parameters
    ----------
      params : list
        List of parameters to update

      gradients : list
        List of corresponding gradients

    Returns
    -------
      params : list
        The updated parameters
    '''
    if self.cache is None:
      self.cache = [np.zeros(shape=p.shape, dtype=float) for p in params]

    if self.delta is None:
      self.delta = [np.zeros(shape=p.shape, dtype=float) for p in params]

    for i, (c, d, p, g) in enumerate(zip(self.cache, self.delta, params, gradients)):
      c = self.rho * c + (1 - self.rho) * g * g
      update = g * np.sqrt(d + self.epsilon) / np.sqrt(c + self.epsilon)
      p -= self.lr * update
      d = self.rho * d + (1 - self.rho) * update * update
      self.cache[i] = c
      self.delta[i] = d

    super(Adadelta, self).update(params, gradients)

    return params
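
# Usage sketch (illustrative, not part of the original module): the step is a
# ratio of two running RMS estimates, so it is nearly scale-free; because the
# delta accumulator starts at zero, the very first step is tiny, of order
# lr * sqrt(epsilon / ((1 - rho) * gradient**2 + epsilon)).
#
#   import numpy as np
#   w = np.zeros(shape=(1,), dtype=float)
#   opt = Adadelta(lr=1., rho=.9, epsilon=1e-6)
#   opt.update(params=[w], gradients=[np.ones_like(w)])
#   # w is now approximately [-0.00316]
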
class Adam (Optimizer):

  '''
  Adam optimization algorithm

  Update the parameters according to the rule

  .. code-block:: python

    at = learning_rate * sqrt(1 - B2**iterations) / (1 - B1**iterations)
    m = B1 * m + (1 - B1) * gradient
    v = B2 * v + (1 - B2) * gradient * gradient
    parameter -= at * m / (sqrt(v) + epsilon)

  Parameters
  ----------
    beta1 : float (default=0.9)
      B1 factor

    beta2 : float (default=0.999)
      B2 factor

    epsilon : float (default=1e-8)
      Precision parameter to avoid division by zero

    *args : list
      Class specialization variables.

    **kwargs : dict
      Class specialization variables.
  '''

  def __init__ (self, beta1=0.9, beta2=0.999, epsilon=1e-8, *args, **kwargs):
    super(Adam, self).__init__(*args, **kwargs)

    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon
    self.ms = None
    self.vs = None
  def update (self, params, gradients):
    '''
    Update the given parameters according to the class optimization algorithm

    Parameters
    ----------
      params : list
        List of parameters to update

      gradients : list
        List of corresponding gradients

    Returns
    -------
      params : list
        The updated parameters
    '''
    a_t = self.lr * np.sqrt(1 - np.power(self.beta2, self.iterations)) / \
          (1 - np.power(self.beta1, self.iterations))

    if self.ms is None:
      self.ms = [np.zeros(shape=p.shape, dtype=float) for p in params]

    if self.vs is None:
      self.vs = [np.zeros(shape=p.shape, dtype=float) for p in params]

    for i, (m, v, p, g) in enumerate(zip(self.ms, self.vs, params, gradients)):
      m = self.beta1 * m + (1 - self.beta1) * g
      v = self.beta2 * v + (1 - self.beta2) * g * g
      p -= a_t * m / (np.sqrt(v) + self.epsilon)
      self.ms[i] = m
      self.vs[i] = v

    super(Adam, self).update(params, gradients)

    return params
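
# Usage sketch (illustrative, not part of the original module): thanks to the
# bias-correction factor a_t, the magnitude of the very first step is close to
# lr for every component, regardless of how large or small its raw gradient is.
#
#   import numpy as np
#   w = np.zeros(shape=(2,), dtype=float)
#   opt = Adam(lr=1e-3)
#   opt.update(params=[w], gradients=[np.array([100., .01])])
#   # both components of w are now approximately -1e-3
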
class Adamax (Optimizer):

  '''
  Adamax optimization algorithm

  Update the parameters according to the rule

  .. code-block:: python

    at = learning_rate / (1 - B1**iterations)
    m = B1 * m + (1 - B1) * gradient
    v = max(B2 * v, abs(gradient))
    parameter -= at * m / (v + epsilon)

  Parameters
  ----------
    beta1 : float (default=0.9)
      B1 factor

    beta2 : float (default=0.999)
      B2 factor

    epsilon : float (default=1e-8)
      Precision parameter to avoid division by zero

    *args : list
      Class specialization variables.

    **kwargs : dict
      Class specialization variables.
  '''

  def __init__ (self, beta1=0.9, beta2=0.999, epsilon=1e-8, *args, **kwargs):
    super(Adamax, self).__init__(*args, **kwargs)

    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon
    self.ms = None
    self.vs = None
  def update (self, params, gradients):
    '''
    Update the given parameters according to the class optimization algorithm

    Parameters
    ----------
      params : list
        List of parameters to update

      gradients : list
        List of corresponding gradients

    Returns
    -------
      params : list
        The updated parameters
    '''
    a_t = self.lr / (1 - np.power(self.beta1, self.iterations))

    if self.ms is None:
      self.ms = [np.zeros(shape=p.shape, dtype=float) for p in params]

    if self.vs is None:
      self.vs = [np.zeros(shape=p.shape, dtype=float) for p in params]

    for i, (m, v, p, g) in enumerate(zip(self.ms, self.vs, params, gradients)):
      m = self.beta1 * m + (1 - self.beta1) * g
      v = np.maximum(self.beta2 * v, np.abs(g))
      p -= a_t * m / (v + self.epsilon)
      self.ms[i] = m
      self.vs[i] = v

    super(Adamax, self).update(params, gradients)

    return params
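
# Usage sketch (illustrative, not part of the original module): here v tracks a
# decayed infinity norm of the gradients, max(beta2 * v, |gradient|), so a single
# large gradient caps the effective step size for many subsequent iterations.
#
#   import numpy as np
#   w = np.zeros(shape=(1,), dtype=float)
#   opt = Adamax(lr=2e-3)
#   opt.update(params=[w], gradients=[np.full_like(w, 10.)])
#   opt.update(params=[w], gradients=[np.full_like(w, .1)])
#   # after the second call opt.vs[0] is still ~9.99 (= beta2 * 10), not 0.1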