#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
import numpy as np
__author__ = ['Mattia Ceccarelli', 'Nico Curti']
__email__ = ['mattia.ceccarelli3@studio.unibo.it', 'nico.curti2@unibo.it']
class Optimizer (object):
'''
Abstract base class for the optimizers
Parameters
----------
lr : float (default=1e-3)
Learning rate value
decay : float (default=0.)
Learning rate decay
lr_min : float (default=0.)
Minimum of learning rate domain
lr_max : float (default=np.inf)
Maximum of learning rate domain
*args : list
Class specialization variables.
**kwargs : dict
Class specialization variables.
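Examples
--------
A minimal sketch of the learning rate decay performed at every call to
update (values are purely illustrative):
.. code-block:: python

  opt = Optimizer(lr=1e-2, decay=1e-1, lr_min=1e-4)
  # each call rescales lr as lr / (decay * iterations + 1) and clips it
  opt.update(params=[], gradients=[])  # lr : 1e-2 -> ~9.09e-3
  opt.update(params=[], gradients=[])  # lr : ~9.09e-3 -> ~7.58e-3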
'''
def __init__ (self, lr=1e-3, decay=0., lr_min=0., lr_max=np.inf, *args, **kwargs):
self.lr = lr
self.decay = decay
self.lr_min = lr_min
self.lr_max = lr_max
self.iterations = 1
def update (self, params, gradients):
'''
Update the optimizer parameters
Parameters
----------
params : list
List of parameters to update
gradients : list
List of corresponding gradients
Returns
-------
self
'''
self.lr *= 1. / (self.decay * self.iterations + 1.)
self.lr = np.clip(self.lr, self.lr_min, self.lr_max)
self.iterations += 1
return self
def __repr__ (self):
'''
Representation
'''
class_name = self.__class__.__qualname__
try:
  # ask the parent class constructor for its argument names
  params = super(type(self), self).__init__.__code__.co_varnames
except AttributeError:
  # the base class inherits directly from object, whose __init__ has no
  # __code__ attribute, so fall back to the constructor of this class
  params = self.__init__.__code__.co_varnames
params = set(params) - {'self', 'args', 'kwargs'}
args = ', '.join(['{0}={1}'.format(k, str(getattr(self, k)))
if not isinstance(getattr(self, k), str) else '{0}="{1}"'.format(k, str(getattr(self, k)))
for k in params])
return '{0}({1})'.format(class_name, args)
def __str__ (self):
'''
Printer
'''
return self.__class__.__name__
class SGD (Optimizer):
'''
Stochastic Gradient Descent specialization
Update the parameters according to the rule
.. code-block:: python
parameter -= learning_rate * gradient
Parameters
----------
*args : list
Class specialization variables.
**kwargs : dict
Class specialization variables.
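Examples
--------
A minimal usage sketch with illustrative numbers:
.. code-block:: python

  import numpy as np
  w  = [np.ones(shape=(3,), dtype=float)]
  dw = [np.full(shape=(3,), fill_value=.5, dtype=float)]
  opt = SGD(lr=1e-1)
  w = opt.update(params=w, gradients=dw)  # each weight becomes 1 - 0.1 * 0.5 = 0.95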
'''
def __init__ (self, *args, **kwargs):
super(SGD, self).__init__(*args, **kwargs)
def update (self, params, gradients):
'''
Update the given parameters according to the class optimization algorithm
Parameters
----------
params : list
List of parameters to update
gradients : list
List of corresponding gradients
Returns
-------
params : list
The updated parameters
'''
for p, g in zip(params, gradients):
p -= self.lr * g # np.clip(g, -1., 1.)
super(SGD, self).update(params, gradients)
return params
class Momentum (Optimizer):
'''
Stochastic Gradient Descent with Momentum specialization
Update the parameters according to the rule
.. code-block:: python
v = momentum * v - learning_rate * gradient
parameter += v
Parameters
----------
momentum : float (default=0.9)
Momentum value
*args : list
Class specialization variables.
**kwargs : dict
Class specialization variables.
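Examples
--------
A minimal sketch of two consecutive updates, showing how the velocity term
accumulates (numbers are illustrative):
.. code-block:: python

  import numpy as np
  w  = [np.zeros(shape=(2,), dtype=float)]
  dw = [np.ones(shape=(2,), dtype=float)]
  opt = Momentum(momentum=.9, lr=1e-1)
  w = opt.update(params=w, gradients=dw)  # v = -0.1,  w = -0.1
  w = opt.update(params=w, gradients=dw)  # v = -0.19, w = -0.29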
'''
def __init__ (self, momentum=.9, *args, **kwargs):
super(Momentum, self).__init__(*args, **kwargs)
self.momentum = momentum
self.velocity = None
def update (self, params, gradients):
'''
Update the given parameters according to the class optimization algorithm
Parameters
----------
params : list
List of parameters to update
gradients : list
List of corresponding gradients
Returns
-------
params : list
The updated parameters
'''
if self.velocity is None:
self.velocity = [np.zeros(shape=p.shape, dtype=float) for p in params]
for i, (v, p, g) in enumerate(zip(self.velocity, params, gradients)):
v = self.momentum * v - self.lr * g # np.clip(g, -1., 1.)
p += v
self.velocity[i] = v
super(Momentum, self).update(params, gradients)
return params
class NesterovMomentum (Optimizer):
'''
Stochastic Gradient Descent with Nesterov Momentum specialization
Update the parameters according to the rule
.. code-block:: python
v = momentum * v - learning_rate * gradient
parameter += momentum * v - learning_rate * gradient
Parameters
----------
momentum : float (default=0.9)
Momentum value
*args : list
Class specialization variables.
**kwargs : dict
Class specialization variables.
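Examples
--------
A minimal sketch with illustrative numbers; the parameter step uses the
look-ahead term momentum * v in place of v:
.. code-block:: python

  import numpy as np
  w  = [np.zeros(shape=(2,), dtype=float)]
  dw = [np.ones(shape=(2,), dtype=float)]
  opt = NesterovMomentum(momentum=.9, lr=1e-1)
  w = opt.update(params=w, gradients=dw)  # v = -0.1, w = 0.9 * (-0.1) - 0.1 = -0.19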
'''
def __init__ (self, momentum=.9, *args, **kwargs):
super(NesterovMomentum, self).__init__(*args, **kwargs)
self.momentum = momentum
self.velocity = None
def update (self, params, gradients):
'''
Update the given parameters according to the class optimization algorithm
Parameters
----------
params : list
List of parameters to update
gradients : list
List of corresponding gradients
Returns
-------
params : list
The updated parameters
'''
if self.velocity is None:
self.velocity = [np.zeros(shape=p.shape, dtype=float) for p in params]
for i, (v, p, g) in enumerate(zip(self.velocity, params, gradients)):
v = self.momentum * v - self.lr * g # np.clip(g, -1., 1.)
p += self.momentum * v - self.lr * g # np.clip(g, -1., 1.)
self.velocity[i] = v
super(NesterovMomentum, self).update(params, gradients)
return params
class Adagrad (Optimizer):
'''
Adagrad optimizer specialization
Update the parameters according to the rule
.. code-block:: python
c += gradient * gradient
parameter -= learning_rate * gradient / (sqrt(c) + epsilon)
Parameters
----------
epsilon : float (default=1e-6)
Small constant added to the denominator to avoid division by zero
*args : list
Class specialization variables.
**kwargs : dict
Class specialization variables.
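Examples
--------
A minimal sketch with illustrative numbers:
.. code-block:: python

  import numpy as np
  w  = [np.ones(shape=(2,), dtype=float)]
  dw = [np.full(shape=(2,), fill_value=.5, dtype=float)]
  opt = Adagrad(lr=1e-1)
  # c = 0.25, step = 0.1 * 0.5 / (sqrt(0.25) + 1e-6) ~ 0.1
  w = opt.update(params=w, gradients=dw)  # w ~ 0.9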
'''
def __init__ (self, epsilon=1e-6, *args, **kwargs):
super(Adagrad, self).__init__(*args, **kwargs)
self.epsilon = epsilon
self.cache = None
def update (self, params, gradients):
'''
Update the given parameters according to the class optimization algorithm
Parameters
----------
params : list
List of parameters to update
gradients : list
List of corresponding gradients
Returns
-------
params : list
The updated parameters
'''
if self.cache is None:
self.cache = [np.zeros(shape=p.shape, dtype=float) for p in params]
for i, (c, p, g) in enumerate(zip(self.cache, params, gradients)):
c += g * g
p -= self.lr * g / (np.sqrt(c) + self.epsilon)
self.cache[i] = c
super(Adagrad, self).update(params, gradients)
return params
class RMSprop (Optimizer):
'''
RMSprop optimization algorithm
Update the parameters according to the rule
.. code-block:: python
c = rho * c + (1. - rho) * gradient * gradient
parameter -= learning_rate * gradient / (sqrt(c) + epsilon)
Parameters
----------
rho : float (default=0.9)
Decay factor
epsilon : float (default=1e-6)
Small constant added to the denominator to avoid division by zero
*args : list
Class specialization variables.
**kwargs : dict
Class specialization variables.
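Examples
--------
A minimal sketch with illustrative numbers:
.. code-block:: python

  import numpy as np
  w  = [np.ones(shape=(2,), dtype=float)]
  dw = [np.full(shape=(2,), fill_value=.5, dtype=float)]
  opt = RMSprop(rho=.9, lr=1e-2)
  # c = 0.1 * 0.25 = 0.025, step = 0.01 * 0.5 / sqrt(0.025) ~ 0.0316
  w = opt.update(params=w, gradients=dw)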
'''
def __init__ (self, rho=.9, epsilon=1e-6, *args, **kwargs):
super(RMSprop, self).__init__(*args, **kwargs)
self.rho = rho
self.epsilon = epsilon
self.cache = None
def update (self, params, gradients):
'''
Update the given parameters according to the class optimization algorithm
Parameters
----------
params : list
List of parameters to update
gradients : list
List of corresponding gradients
Returns
-------
params : list
The updated parameters
'''
if self.cache is None:
self.cache = [np.zeros(shape=p.shape, dtype=float) for p in params]
for i, (c, p, g) in enumerate(zip(self.cache, params, gradients)):
c = self.rho * c + (1 - self.rho) * g * g
p -= (self.lr * g / (np.sqrt(c) + self.epsilon))
self.cache[i] = c
super(RMSprop, self).update(params, gradients)
return params
class Adadelta (Optimizer):
'''
AdaDelta optimization algorithm
Update the parameters according to the rule
.. code-block:: python
c = rho * c + (1. - rho) * gradient * gradient
update = gradient * sqrt(d + epsilon) / sqrt(c + epsilon)
parameter -= learning_rate * update
d = rho * d + (1. - rho) * update * update
Parameters
----------
rho : float (default=0.9)
Decay factor
epsilon : float (default=1e-6)
Small constant added to the denominator to avoid division by zero
*args : list
Class specialization variables.
**kwargs : dict
Class specialization variables.
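Examples
--------
A minimal sketch with illustrative numbers; on the first call the running
average of updates d is still zero, so the step is driven by sqrt(epsilon):
.. code-block:: python

  import numpy as np
  w  = [np.ones(shape=(2,), dtype=float)]
  dw = [np.full(shape=(2,), fill_value=.5, dtype=float)]
  opt = Adadelta(rho=.9, lr=1.)
  # c = 0.025, update = 0.5 * sqrt(1e-6) / sqrt(0.025 + 1e-6) ~ 3.2e-3
  w = opt.update(params=w, gradients=dw)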
'''
def __init__ (self, rho=0.9, epsilon=1e-6, *args, **kwargs):
super(Adadelta, self).__init__(*args, **kwargs)
self.rho = rho
self.epsilon = epsilon
self.cache = None
self.delta = None
def update (self, params, gradients):
'''
Update the given parameters according to the class optimization algorithm
Parameters
----------
params : list
List of parameters to update
gradients : list
List of corresponding gradients
Returns
-------
params : list
The updated parameters
'''
if self.cache is None:
self.cache = [np.zeros(shape=p.shape, dtype=float) for p in params]
if self.delta is None:
self.delta = [np.zeros(shape=p.shape, dtype=float) for p in params]
for i, (c, d, p, g) in enumerate(zip(self.cache, self.delta, params, gradients)):
c = self.rho * c + (1 - self.rho) * g * g
update = g * np.sqrt(d + self.epsilon) / np.sqrt(c + self.epsilon)
p -= self.lr * update
d = self.rho * d + (1 - self.rho) * update * update
self.cache[i] = c
self.delta[i] = d
super(Adadelta, self).update(params, gradients)
return params
class Adam (Optimizer):
'''
Adam optimization algorithm
Update the parameters according to the rule
.. code-block:: python
at = learning_rate * sqrt(1 - B2**iterations) / (1 - B1**iterations)
m = B1 * m + (1 - B1) * gradient
v = B2 * v + (1 - B2) * gradient * gradient
parameter -= at * m / (sqrt(v) + epsilon)
Parameters
----------
beta1 : float (default=0.9)
B1 factor
beta2 : float (default=0.999)
B2 factor
epsilon : float (default=1e-8)
Small constant added to the denominator to avoid division by zero
*args : list
Class specialization variables.
**kwargs : dict
Class specialization variables.
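Examples
--------
A minimal sketch of a single step with illustrative numbers; at the first
iteration the bias-corrected step has magnitude close to the learning rate:
.. code-block:: python

  import numpy as np
  w  = [np.ones(shape=(2,), dtype=float)]
  dw = [np.full(shape=(2,), fill_value=.5, dtype=float)]
  opt = Adam(lr=1e-3)
  # m = 0.1 * g = 0.05, v = 0.001 * g * g = 2.5e-4, step ~ lr = 1e-3
  w = opt.update(params=w, gradients=dw)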
'''
def __init__ (self, beta1=0.9, beta2=0.999, epsilon=1e-8, *args, **kwargs):
super(Adam, self).__init__(*args, **kwargs)
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.ms = None
self.vs = None
def update (self, params, gradients):
'''
Update the given parameters according to the class optimization algorithm
Parameters
----------
params : list
List of parameters to update
gradients : list
List of corresponding gradients
Returns
-------
params : list
The updated parameters
'''
a_t = self.lr * np.sqrt(1 - np.power(self.beta2, self.iterations)) / \
(1 - np.power(self.beta1, self.iterations))
if self.ms is None:
self.ms = [np.zeros(shape=p.shape, dtype=float) for p in params]
if self.vs is None:
self.vs = [np.zeros(shape=p.shape, dtype=float) for p in params]
for i, (m, v, p, g) in enumerate(zip(self.ms, self.vs, params, gradients)):
m = self.beta1 * m + (1 - self.beta1) * g
v = self.beta2 * v + (1 - self.beta2) * g * g
p -= a_t * m / (np.sqrt(v) + self.epsilon)
self.ms[i] = m
self.vs[i] = v
super(Adam, self).update(params, gradients)
return params
class Adamax (Optimizer):
'''
Adamax optimization algorithm
Update the parameters according to the rule
.. code-block:: python
at = learning_rate / (1 - B1**iterations)
m = B1 * m + (1 - B1) * gradient
v = max(B2 * v, abs(gradient))
parameter -= at * m / (v + epsilon)
Parameters
----------
beta1 : float (default=0.9)
B1 factor
beta2 : float (default=0.999)
B2 factor
epsilon : float (default=1e-8)
Small constant added to the denominator to avoid division by zero
*args : list
Class specialization variables.
**kwargs : dict
Class specialization variables.
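Examples
--------
A minimal sketch with illustrative numbers; on the first step v is simply
abs(gradient), so the step magnitude is close to the learning rate:
.. code-block:: python

  import numpy as np
  w  = [np.ones(shape=(2,), dtype=float)]
  dw = [np.full(shape=(2,), fill_value=.5, dtype=float)]
  opt = Adamax(lr=2e-3)
  # at = lr / (1 - beta1) = 0.02, m = 0.05, v = 0.5, step ~ 0.02 * 0.05 / 0.5 = 2e-3
  w = opt.update(params=w, gradients=dw)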
'''
def __init__ (self, beta1=0.9, beta2=0.999, epsilon=1e-8, *args, **kwargs):
super(Adamax, self).__init__(*args, **kwargs)
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.ms = None
self.vs = None
def update (self, params, gradients):
'''
Update the given parameters according to the class optimization algorithm
Parameters
----------
params : list
List of parameters to update
gradients : list
List of corresponding gradients
Returns
-------
params : list
The updated parameters
'''
a_t = self.lr / (1 - np.power(self.beta1, self.iterations))
if self.ms is None:
self.ms = [np.zeros(shape=p.shape, dtype=float) for p in params]
if self.vs is None:
self.vs = [np.zeros(shape=p.shape, dtype=float) for p in params]
for i, (m, v, p, g) in enumerate(zip(self.ms, self.vs, params, gradients)):
m = self.beta1 * m + (1 - self.beta1) * g
v = np.maximum(self.beta2 * v, np.abs(g))
p -= a_t * m / (v + self.epsilon)
self.ms[i] = m
self.vs[i] = v
super(Adamax, self).update(params, gradients)
return params