# -*- coding: utf-8 -*-
'''This module defines gradient descent optimizers with adaptive learning rates.
'''
import numpy as np
import theano
import theano.tensor as TT
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from .base import Optimizer
from . import util
__all__ = ['RProp', 'RMSProp', 'ADAGRAD', 'ADADELTA', 'ESGD', 'Adam']
class RProp(Optimizer):
r'''Resilient backpropagation optimizer.
Parameters
----------
rprop_increase: float, optional (default 1.01)
Increase step sizes at this rate when the gradient sign stays the same.
rprop_decrease: float, optional (default 0.99)
Decrease step sizes at this rate when the gradient sign changes.
rprop_min_step: float, optional (default 0)
Minimum step size for any parameter.
rprop_max_step: float, optional (default 100)
Maximum step size for any parameter.
momentum: float, optional (default 0)
Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
momentum.
nesterov: bool, optional (default False)
Set this to ``True`` to enable Nesterov-style momentum updates, whenever
``momentum`` is nonzero.
Notes
-----
The RProp method takes small steps in parameter space using local gradient
information. RProp is unlike "vanilla" first-order techniques like
:class:`SGD <downhill.first_order.SGD>`, however, because only the signs of
the gradients are taken into account when making parameter updates. That is,
the step size for each parameter is independent of the magnitude of the
gradient for that parameter.
To accomplish this, RProp maintains a separate learning rate for every
parameter in the model, and adjusts this learning rate based on the
consistency of the sign of the gradient over time. Whenever two consecutive
gradients for a parameter have the same sign, the learning rate for that
parameter increases, and whenever the signs disagree, the learning rate
decreases. This has a similar effect to momentum-based stochastic gradient
methods but effectively maintains parameter-specific learning rates.
    .. math::
        \begin{eqnarray*}
        && \mbox{if } \frac{\partial\mathcal{L}}{\partial p}_{t-1}
           \frac{\partial\mathcal{L}}{\partial p} > 0 \\
        && \qquad \Delta_t = \min (\eta_+\Delta_{t-1}, \Delta_+) \\
        && \mbox{if } \frac{\partial\mathcal{L}}{\partial p}_{t-1}
           \frac{\partial\mathcal{L}}{\partial p} < 0 \\
        && \qquad \Delta_t = \max (\eta_-\Delta_{t-1}, \Delta_-) \\
        && \qquad \frac{\partial\mathcal{L}}{\partial p} = 0 \\
        && p_{t+1} = p_t - \mbox{sgn}\left(
           \frac{\partial\mathcal{L}}{\partial p}\right) \Delta_t
        \end{eqnarray*}
    Here, :math:`\mbox{sgn}(\cdot)` is the sign function (i.e., it returns -1
    if its argument is negative and 1 otherwise), :math:`\eta_-` and
    :math:`\eta_+` are the rates at which the step size decreases (increases)
    when consecutive gradients disagree (agree) in sign, and :math:`\Delta_+`
    and :math:`\Delta_-` are the maximum and minimum step sizes.
The implementation here is actually the "iRprop-" variant of RProp described
in Algorithm 4 from [Igel00]_. This variant resets the running gradient
estimates to zero in cases where the previous and current gradients have
switched signs.
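    To make the update concrete, here is a rough NumPy sketch of the same
    iRprop- rule, outside this module's Theano machinery (the names and
    constants below are illustrative, not part of the downhill API)::

        import numpy as np

        eta_pos, eta_neg = 1.01, 0.99   # rprop_increase, rprop_decrease
        lo, hi = 0.0, 100.0             # rprop_min_step, rprop_max_step

        def irprop_minus(p, grad, prev_grad, step):
            """One iRprop- update; returns the new (p, grad, step)."""
            test = grad * prev_grad                 # sign agreement indicator
            step = np.where(test > 0, step * eta_pos, step)
            step = np.where(test < 0, step * eta_neg, step)
            step = np.clip(step, lo, hi)
            grad = np.where(test < 0, 0.0, grad)    # reset on a sign flip
            return p - np.sign(grad) * step, grad, step
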
References
----------
.. [Ried92] M. Riedmiller & H. Braun. (1992) "Rprop - A Fast Adaptive
Learning Algorithm." In Proceedings of the International Symposium on
Computer and Information Science VII.
.. [Igel00] C. Igel & M. Hüsken. (2000) "Improving the Rprop Learning
Algorithm." In Proceedings of the Second International Symposium on
Neural Computation.
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.17.1332
'''
def _prepare(self,
rprop_increase=1.01,
rprop_decrease=0.99,
rprop_min_step=0,
rprop_max_step=100,
**kwargs):
self.step_increase = util.as_float(rprop_increase)
self.step_decrease = util.as_float(rprop_decrease)
self.min_step = util.as_float(rprop_min_step)
self.max_step = util.as_float(rprop_max_step)
util.log_param('rprop_increase', rprop_increase)
util.log_param('rprop_decrease', rprop_decrease)
util.log_param('rprop_min_step', rprop_min_step)
util.log_param('rprop_max_step', rprop_max_step)
super(RProp, self)._prepare(**kwargs)
    def _get_updates_for(self, param, grad):
        grad_tm1 = util.shared_like(param, 'grad')
        step_tm1 = util.shared_like(param, 'step', self.learning_rate.eval())
        test = grad * grad_tm1  # positive where signs agree, negative where they flip
        diff = TT.lt(test, 0)
        steps = step_tm1 * (TT.eq(test, 0) +
                            TT.gt(test, 0) * self.step_increase +
                            diff * self.step_decrease)
        step = TT.minimum(self.max_step, TT.maximum(self.min_step, steps))
        grad = grad - diff * grad  # iRprop-: zero the gradient where the sign flipped
        yield param, TT.sgn(grad) * step
        yield grad_tm1, grad
        yield step_tm1, step
class ADAGRAD(Optimizer):
r'''ADAGRAD optimizer.
Parameters
----------
rms_regularizer: float, optional (default 1e-8)
Regularize the learning rate scaling factor by this :math:`\epsilon`.
momentum: float, optional (default 0)
Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
momentum.
nesterov: bool, optional (default False)
Set this to ``True`` to enable Nesterov-style momentum updates, whenever
``momentum`` is nonzero.
Notes
-----
The ADAGRAD method uses the same general strategy as all first-order
stochastic gradient methods, in the sense that these methods make small
parameter adjustments iteratively using local derivative information.
The difference with ADAGRAD is that as gradients are computed during each
parameter update, their squares are accumulated, and this accumulated value
is used to rescale the global learning rate :math:`\alpha` separately for
each parameter.
.. math::
\begin{eqnarray*}
g_{t+1} &=& g_t + \left(\frac{\partial\mathcal{L}}{\partial p}\right)^2 \\
        p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{g_{t+1} + \epsilon}}
           \frac{\partial\mathcal{L}}{\partial p}
\end{eqnarray*}
    Like the other adaptive learning methods, this method effectively
    maintains a parameter-specific learning rate. Unlike :class:`RMSProp` and
    :class:`ADADELTA`, however, in ADAGRAD the gradient magnitudes accumulate
    throughout training, which scales the learning rate for each parameter and
    also effectively anneals the overall learning rate as training progresses.
In this implementation, the scale values are regularized (made less extreme)
by :math:`\epsilon`, which is specified using the ``rms_regularizer``
parameter.
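    For intuition, the update amounts to the following NumPy sketch (a
    minimal illustration, not part of the downhill API)::

        import numpy as np

        alpha, eps = 1e-4, 1e-8   # learning rate and rms_regularizer

        def adagrad(p, grad, g2):
            """One ADAGRAD update; returns the new (p, g2)."""
            g2 = g2 + grad * grad  # squared gradients accumulate forever
            return p - alpha * grad / np.sqrt(g2 + eps), g2
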
References
----------
    .. [Duch10] J. Duchi, E. Hazan, & Y. Singer. (2010) "Adaptive Subgradient
       Methods for Online Learning and Stochastic Optimization." In
       Proceedings of the Conference on Learning Theory (COLT).
'''
def _prepare(self, rms_regularizer=1e-8, **kwargs):
self.epsilon = util.as_float(rms_regularizer)
util.log_param('rms_regularizer', rms_regularizer)
super(ADAGRAD, self)._prepare(**kwargs)
    def _get_updates_for(self, param, grad):
        g2_tm1 = util.shared_like(param, 'g2_acc')
        g2_t = g2_tm1 + grad * grad  # running sum of squared gradients
        yield g2_tm1, g2_t
        yield param, grad * self.learning_rate / TT.sqrt(g2_t + self.epsilon)
class RMSProp(Optimizer):
r'''RMSProp optimizer.
Parameters
----------
learning_rate: float, optional (default 1e-4)
Step size to take during optimization.
rms_halflife: float, optional (default 14)
Compute RMS gradient values using an exponentially weighted moving
average that decays with this halflife.
rms_regularizer: float, optional (default 1e-8)
Regularize RMS gradient values by this :math:`\epsilon`.
momentum: float, optional (default 0)
Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
momentum.
nesterov: bool, optional (default False)
Set this to ``True`` to enable Nesterov-style momentum updates, whenever
``momentum`` is nonzero.
Notes
-----
The RMSProp method uses the same general strategy as all first-order
stochastic gradient methods, in the sense that these methods make small
parameter adjustments iteratively using local derivative information.
The difference here is that as gradients are computed during each parameter
update, an exponentially-weighted moving average (EWMA) of gradient
magnitudes is maintained as well. At each update, the EWMA is used to
compute the root-mean-square (RMS) gradient value that's been seen in the
recent past. The actual gradient is normalized by this RMS scaling factor
    before being applied to update the parameters. Intuitively, the ratio of
    the gradient to its RMS stays near 1 whenever the gradient has roughly
    constant magnitude, so RMSProp takes steps on the order of the learning
    rate, and it takes larger steps whenever the local scale of the gradient
    starts to increase.
.. math::
\begin{eqnarray*}
f_{t+1} &=& \gamma f_t + (1 - \gamma) \frac{\partial\mathcal{L}}{\partial p} \\
g_{t+1} &=& \gamma g_t + (1 - \gamma) \left(
\frac{\partial\mathcal{L}}{\partial p}\right)^2 \\
p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{g_{t+1} - f_{t+1}^2 + \epsilon}}
\frac{\partial\mathcal{L}}{\partial p}
\end{eqnarray*}
Like :class:`RProp`, this learning method effectively maintains a sort of
parameter-specific momentum value, but this method takes into account both
the sign and the magnitude of the gradient for each parameter.
In this algorithm, RMS values are regularized (made less extreme) by
:math:`\epsilon`, which is specified using the ``rms_regularizer`` keyword
argument.
The weight parameter :math:`\gamma` for the EWMA window is computed from the
``rms_halflife`` keyword argument, such that the actual EWMA weight varies
inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln
2}{h}}`.
The implementation here is taken from [Grav13]_, equations (38)--(45).
Graves' implementation in particular seems to have introduced the
:math:`f_t` terms into the RMS computation; these terms appear to act as a
sort of momentum for the RMS values.
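    In NumPy terms, one update looks roughly like this (an illustrative
    sketch, not part of the downhill API)::

        import numpy as np

        alpha, eps = 1e-4, 1e-8          # learning rate and rms_regularizer
        gamma = np.exp(-np.log(2) / 14)  # EWMA weight from rms_halflife

        def rmsprop(p, grad, g1, g2):
            """One Graves-style RMSProp update; returns (p, g1, g2)."""
            g1 = gamma * g1 + (1 - gamma) * grad         # EWMA of gradients
            g2 = gamma * g2 + (1 - gamma) * grad * grad  # EWMA of squares
            rms = np.sqrt(g2 - g1 * g1 + eps)            # centered RMS
            return p - alpha * grad / rms, g1, g2
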
References
----------
.. [Grav13] A. Graves. (2013) "Generating Sequences With Recurrent Neural
Networks." http://arxiv.org/abs/1308.0850
'''
def _prepare(self, rms_halflife=14, rms_regularizer=1e-8, **kwargs):
self.ewma = util.as_float(np.exp(-np.log(2) / rms_halflife))
self.epsilon = util.as_float(rms_regularizer)
util.log_param('rms_halflife', rms_halflife)
util.log_param('rms_regularizer', rms_regularizer)
super(RMSProp, self)._prepare(**kwargs)
    def _get_updates_for(self, param, grad):
        g1_tm1 = util.shared_like(param, 'g1_ewma')
        g2_tm1 = util.shared_like(param, 'g2_ewma')
        g1_t = self.ewma * g1_tm1 + (1 - self.ewma) * grad         # EWMA of gradients
        g2_t = self.ewma * g2_tm1 + (1 - self.ewma) * grad * grad  # EWMA of squared gradients
        rms = TT.sqrt(g2_t - g1_t * g1_t + self.epsilon)  # centered RMS estimate
        yield g1_tm1, g1_t
        yield g2_tm1, g2_t
        yield param, self.learning_rate * grad / rms
class ADADELTA(RMSProp):
r'''ADADELTA optimizer.
Parameters
----------
rms_halflife: float, optional (default 14)
Compute RMS gradient values using an exponentially weighted moving
average that decays with this halflife.
rms_regularizer: float, optional (default 1e-8)
Regularize RMS gradient values by this :math:`\epsilon`.
momentum: float, optional (default 0)
Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
momentum.
nesterov: bool, optional (default False)
Set this to ``True`` to enable Nesterov-style momentum updates, whenever
``momentum`` is nonzero.
Notes
-----
The ADADELTA method uses the same general strategy as all first-order
stochastic gradient methods, in the sense that these methods make small
parameter adjustments iteratively using local derivative information.
    The difference with ADADELTA is that as gradients are computed during each
    parameter update, an exponentially-weighted moving average (EWMA) of the
    squared gradients, as well as an EWMA of the squared parameter steps, are
    maintained as well. The actual gradient is normalized by the ratio of the
    root-mean-square (RMS) parameter step size to the RMS gradient magnitude.
.. math::
\begin{eqnarray*}
g_{t+1} &=& \gamma g_t + (1 - \gamma) \left(
\frac{\partial\mathcal{L}}{\partial p}\right)^2 \\
v_{t+1} &=& \frac{\sqrt{x_t + \epsilon}}{\sqrt{g_{t+1} + \epsilon}}
\frac{\partial\mathcal{L}}{\partial p} \\
x_{t+1} &=& \gamma x_t + (1 - \gamma) v_{t+1}^2 \\
p_{t+1} &=& p_t - v_{t+1}
\end{eqnarray*}
Like :class:`RProp` and the :class:`RMSProp`--:class:`ESGD` family, this
learning method effectively maintains a sort of parameter-specific momentum
value. The primary difference between this method and :class:`RMSProp` is
that ADADELTA additionally incorporates a sliding window of RMS parameter
step sizes, (somewhat) obviating the need for a learning rate parameter.
In this implementation, the RMS values are regularized (made less extreme)
by :math:`\epsilon`, which is specified using the ``rms_regularizer``
parameter.
The weight parameter :math:`\gamma` for the EWMA window is computed from the
``rms_halflife`` keyword argument, such that the actual EWMA weight varies
inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln
2}{h}}`.
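    In NumPy terms, one update looks roughly like this (an illustrative
    sketch, not part of the downhill API)::

        import numpy as np

        eps = 1e-8                       # rms_regularizer
        gamma = np.exp(-np.log(2) / 14)  # EWMA weight from rms_halflife

        def adadelta(p, grad, g2, x2):
            """One ADADELTA update; returns the new (p, g2, x2)."""
            g2 = gamma * g2 + (1 - gamma) * grad * grad    # EWMA of grad^2
            delta = grad * np.sqrt(x2 + eps) / np.sqrt(g2 + eps)
            x2 = gamma * x2 + (1 - gamma) * delta * delta  # EWMA of step^2
            return p - delta, g2, x2
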
References
----------
.. [Zeil12] M. Zeiler. (2012) "ADADELTA: An adaptive learning rate method."
http://arxiv.org/abs/1212.5701
'''
    def _get_updates_for(self, param, grad):
        x2_tm1 = util.shared_like(param, 'x2_ewma')
        g2_tm1 = util.shared_like(param, 'g2_ewma')
        g2_t = self.ewma * g2_tm1 + (1 - self.ewma) * grad * grad  # EWMA of squared gradients
        delta = grad * TT.sqrt(x2_tm1 + self.epsilon) / TT.sqrt(g2_t + self.epsilon)
        x2_t = self.ewma * x2_tm1 + (1 - self.ewma) * delta * delta  # EWMA of squared steps
        yield g2_tm1, g2_t
        yield x2_tm1, x2_t
        yield param, delta
class ESGD(RMSProp):
r'''Equilibrated SGD computes a diagonal Hessian preconditioner.
Parameters
----------
hv_method: {'rop', 'lop', 'grad'}, optional
The Hv (Hessian-vector) product will be computed using the given method.
The default is to use 'rop'.
learning_rate: float, optional (default 1e-4)
Step size to take during optimization.
rms_halflife: float, optional (default 14)
Compute RMS gradient values using an exponentially weighted moving
average that decays with this halflife.
rms_regularizer: float, optional (default 1e-8)
Regularize RMS gradient values by this :math:`\epsilon`.
momentum: float, optional (default 0)
Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
momentum.
nesterov: bool, optional (default False)
Set this to ``True`` to enable Nesterov-style momentum updates, whenever
``momentum`` is nonzero.
Notes
-----
The ESGD method uses the same general strategy as all first-order
stochastic gradient methods, in the sense that these methods make small
parameter adjustments iteratively using local derivative information.
The difference here is that as gradients are computed during each parameter
update, an exponentially-weighted moving average (EWMA) of estimates of the
diagonal of the Hessian (the matrix of second derivatives) is maintained as
well. At each update, the EWMA is used to compute the root-mean-square (RMS)
diagonal value that's been seen in the recent past. The actual gradient is
scaled by the inverse of this diagonal preconditioner before being applied
to update the parameters. Intuitively, this causes the algorithm to
"reshape" the loss function in parameter space, such that directions of
steep gradient (i.e., large diagonal values) and directions of shallow
gradient (i.e., small diagonal values) are scaled to be approximately the
same slope.
    The diagonal estimates are computed using a nice trick: a vector :math:`r
    \sim \mathcal{N}(0, 1)` of standard normal values is sampled at each
    update step, and the Hessian-vector product :math:`Hr` is computed
    symbolically. The squared entries of these products, averaged over time,
    approximate a (squared) diagonal preconditioner derived from the Hessian.
    Because :math:`Hr` is itself a vector, the full Hessian :math:`H` never
    needs to be computed or stored.
    .. math::
        \begin{eqnarray*}
        r &\sim& \mathcal{N}(0, 1) \\
        Hr &=& \frac{\partial^2 \mathcal{L}}{\partial p^2} r \\
        D_{t+1} &=& \gamma D_t + (1 - \gamma) (Hr)^2 \\
        p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{D_{t+1}} + \epsilon}
           \frac{\partial\mathcal{L}}{\partial p}
        \end{eqnarray*}
    Like :class:`RProp` and the :class:`ADADELTA`--:class:`RMSProp` family,
    this learning method effectively maintains a parameter-specific learning
    rate for each parameter in the loss.
    In this implementation, :math:`\epsilon` regularizes the RMS values; it is
    specified using the ``rms_regularizer`` parameter.
The weight parameter :math:`\gamma` for the EWMA is computed from the
``rms_halflife`` keyword argument, such that the actual EWMA weight varies
inversely with the halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln
2}{h}}`.
The primary difference between this implementation and the algorithm
described in the paper (see below) is the use of an EWMA to decay the
diagonal values over time, while in the paper the diagonal is divided by the
training iteration. The EWMA halflife should be set to something reasonably
large to ensure that this method emulates the method described in the
original paper.
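    Given a Hessian-vector product (obtained via automatic differentiation in
    practice), one update looks roughly like this in NumPy (an illustrative
    sketch, not part of the downhill API)::

        import numpy as np

        alpha, eps = 1e-4, 1e-8          # learning rate and rms_regularizer
        gamma = np.exp(-np.log(2) / 14)  # EWMA weight from rms_halflife

        def esgd(p, grad, Hr, D):
            """One ESGD update, where ``Hr`` is the product of the Hessian
            with a fresh standard-normal vector; returns the new (p, D)."""
            D = gamma * D + (1 - gamma) * Hr * Hr  # EWMA of (Hr)^2
            return p - alpha * grad / (np.sqrt(D) + eps), D
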
References
----------
.. [Daup14] Y. Dauphin, H. de Vries, J. Chung & Y. Bengio. (2014) "RMSProp
and equilibrated adaptive learning rates for non-convex optimization."
http://arxiv.org/abs/1502.04390
'''
    def __init__(self, *args, **kwargs):
self.rng = RandomStreams()
self.hv_method = kwargs.pop('hv_method', 'rop').lower()
assert self.hv_method in ('rop', 'lop', 'grad')
super(ESGD, self).__init__(*args, **kwargs)
    def _get_updates_for(self, param, grad):
        D_tm1 = util.shared_like(param, 'D_ewma')
        v = self.rng.normal(param.shape)  # fresh standard-normal probe vector
        if self.hv_method == 'rop':
            Hv = TT.Rop(grad, param, v)
        elif self.hv_method == 'lop':
            Hv = TT.Lop(grad, param, v)
        elif self.hv_method == 'grad':
            Hv = TT.grad(TT.sum(grad * v), param)
        D_t = self.ewma * D_tm1 + (1 - self.ewma) * Hv * Hv  # EWMA of (Hv)^2
        denom = TT.sqrt(D_t) + self.epsilon
        yield D_tm1, D_t
        yield param, grad * self.learning_rate / denom
class Adam(RMSProp):
r'''Adam optimizer using unbiased gradient moment estimates.
Parameters
----------
learning_rate: float, optional (default 1e-4)
Step size to take during optimization.
    beta1_decay: float, optional (default 1 - 1e-6)
        Decay the effective :math:`\beta_1` by this factor :math:`\lambda`
        after every update.
beta1_halflife: float, optional (default 7)
Compute RMS gradient estimates using an exponentially weighted moving
average that decays with this halflife.
beta2_halflife: float, optional (default 69)
Compute squared-magnitude RMS gradient estimates using an exponentially
weighted moving average that decays with this halflife.
rms_regularizer: float, optional (default 1e-8)
Regularize RMS gradient values by this :math:`\epsilon`.
momentum: float, optional (default 0)
Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
momentum.
nesterov: bool, optional (default False)
Set this to ``True`` to enable Nesterov-style momentum updates, whenever
``momentum`` is nonzero.
Notes
-----
The Adam method uses the same general strategy as all first-order
stochastic gradient methods, in the sense that these methods make small
parameter adjustments iteratively using local derivative information.
The difference here is that as gradients are computed during each parameter
update, exponentially-weighted moving averages (EWMAs) of (1) the first
moment of the recent gradient values and (2) the second moment of recent
gradient values are maintained as well. At each update, the step taken is
proportional to the ratio of the first moment to the second moment.
.. math::
        \begin{eqnarray*}
        \beta_1^{(t)} &=& \beta_1 \lambda^{t-1} \\
        f_{t+1} &=& \beta_1^{(t)} f_t + (1 - \beta_1^{(t)})
           \frac{\partial\mathcal{L}}{\partial\theta} \\
        g_{t+1} &=& \beta_2 g_t + (1 - \beta_2)
           \left(\frac{\partial\mathcal{L}}{\partial\theta}\right)^2 \\
        \theta_{t+1} &=& \theta_t - \alpha \,
           \frac{f_{t+1} / (1 - \beta_1^t)}{\sqrt{g_{t+1} / (1 - \beta_2^t)} + \epsilon}
        \end{eqnarray*}
Like all adaptive optimization algorithms, this optimizer effectively
maintains a sort of parameter-specific momentum value. It shares with
:class:`RMSProp` and :class:`ADADELTA` the idea of using an EWMA to track
recent quantities related to the stochastic gradient during optimization.
But the Adam method is unique in that it incorporates an explicit
computation to remove the bias from these estimates.
In this implementation, :math:`\epsilon` regularizes the RMS values and is
given using the ``rms_regularizer`` keyword argument. The weight parameters
:math:`\beta_1` and :math:`\beta_2` for the first and second EWMA windows
are computed from the ``beta1_halflife`` and ``beta2_halflife`` keyword
    arguments, respectively, such that each EWMA weight varies inversely with
    its halflife :math:`h`: :math:`\beta = e^{\frac{-\ln 2}{h}}`. The decay
    :math:`\lambda` for the :math:`\beta_1` EWMA is provided by the
    ``beta1_decay`` keyword argument.
The implementation here is taken from Algorithm 1 of [King15]_.
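    In NumPy terms, one update looks roughly like this, ignoring the
    :math:`\beta_1` decay (an illustrative sketch, not part of the downhill
    API)::

        import numpy as np

        alpha, eps = 1e-4, 1e-8
        beta1 = np.exp(-np.log(2) / 7)    # from beta1_halflife
        beta2 = np.exp(-np.log(2) / 69)   # from beta2_halflife

        def adam(p, grad, g1, g2, t):
            """One Adam update; returns the new (p, g1, g2, t)."""
            t = t + 1
            g1 = beta1 * g1 + (1 - beta1) * grad         # 1st-moment EWMA
            g2 = beta2 * g2 + (1 - beta2) * grad * grad  # 2nd-moment EWMA
            numer = g1 / (1 - beta1 ** t)                # bias correction
            denom = np.sqrt(g2 / (1 - beta2 ** t))       # bias correction
            return p - alpha * numer / (denom + eps), g1, g2, t
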
References
----------
.. [King15] D. Kingma & J. Ba. (ICLR 2015) "Adam: A Method for
Stochastic Optimization." http://arxiv.org/abs/1412.6980
'''
    def _prepare(self,
                 beta1_halflife=7,
                 beta2_halflife=69,
                 beta1_decay=1 - 1e-6,
                 **kwargs):
        self.beta1 = util.as_float(np.exp(-np.log(2) / beta1_halflife))
        self.beta2 = util.as_float(np.exp(-np.log(2) / beta2_halflife))
        self.beta1_decay = util.as_float(beta1_decay)
        util.log_param('beta1_halflife', beta1_halflife)
        util.log_param('beta2_halflife', beta2_halflife)
        util.log_param('beta1_decay', beta1_decay)
        super(Adam, self)._prepare(**kwargs)
    def _get_updates_for(self, param, grad):
        t_tm1 = theano.shared(np.cast['float32'](0), 't')
        t_t = 1 + t_tm1
        # Decay beta_1 over time, as documented by the beta1_decay parameter.
        beta1_t = self.beta1 * self.beta1_decay ** t_tm1
        g1_tm1 = util.shared_like(param, 'g1_ewma')
        g2_tm1 = util.shared_like(param, 'g2_ewma')
        g1_t = beta1_t * g1_tm1 + (1 - beta1_t) * grad               # 1st-moment EWMA
        g2_t = self.beta2 * g2_tm1 + (1 - self.beta2) * grad * grad  # 2nd-moment EWMA
        numer = g1_t / (1 - self.beta1 ** t_t)           # bias-corrected first moment
        denom = TT.sqrt(g2_t / (1 - self.beta2 ** t_t))  # bias-corrected RMS
        yield t_tm1, t_t
        yield g1_tm1, g1_t
        yield g2_tm1, g2_t
        yield param, self.learning_rate * numer / (denom + self.epsilon)