Source code for convis.optimizer

"""

Optimizer classes in addition to the ones provided by
`torch.optim`.

The Optimizers used here assume that they estimate one 
set of parameters. If the model should be fitted to some
data at one time and to other data at another time, a new
instance of the optimizer should be used.

You can set the optimizer of a model directly for that:

.. code::
    python

    import convis
    m = convis.LNLN()
    m.set_optimizer.LBFGS()
    m.optimize(input_a, goal_a)
    a_optim = m._optimizer # store the optimizer 
    m.set_optimizer.LBFGS() # initialize a new optimizer
    m.optimize(input_b, goal_b) # optimizing with the new optimizer
    m._optimizer = a_optim # using the first optimizer again

But this method can leave the optimizer confused (i.e. it might not
work as intended), as the state of the model and the parameters are
changed by running the second optimizer on some other input.

To use the same model for two different fitting processes,
it is recommended to back up all relevant information and
to restore it when returning to a previous fitting process.

To do that there are three options:
    - using `v = model.get_all()` to retrieve the information into a variable and `model.set_all(v)` to restore it
    - using `model.push_all()` to push the information onto a stack within the model and `model.pop_all()` to retrieve it. With this method the values can only be restored once, unless pushed again onto the stack.
    - using `model.store_all(some_name)` to store the information under a certain name and retrieving it with `model.retrieve_all(some_name)`, which can be used more than once and does not rely on user managed variables.

.. code::
    python

    import convis
    m = convis.LNLN()
    m.store_all('init') # stores state, parameter values and optimizer under a name
    m.set_optimizer.LBFGS()
    m.optimize(input_a, goal_a)
    m.push_all() # alternatively, you can save the optimizer, 
    # state and parameters onto a stack (optimizers will 
    # mostly assume that the parameters are not changed
    # between steps, but this differs per algorithm)
    m.retrieve_all('init') # retrieves state, parameter values and optimizer from before
    m.set_optimizer.LBFGS() # initialize a new optimizer
    m.optimize(input_b, goal_b) # optimizing with the new optimizer
    m.pop_all() # returning to the previous parameters, state and optimizer

"""

from torch.optim.optimizer import Optimizer
from collections import defaultdict
import numpy as np
import torch

class FiniteDifferenceGradientOptimizer(Optimizer):
    """
        Quasi-Newton method with a finite difference approximation
        of the 2nd order gradient.

        Every call to `step` records the current value and gradient of
        each parameter. From each pair of consecutive records a secant
        estimate of the point where the gradient vanishes is computed,
        and the parameter is set to the average of these estimates,
        weighted by the RMS change of the gradient within the pair.

        When no secant estimate is available (first step, or the
        gradient did not change measurably between any two records)
        a plain gradient step is taken instead.

        The history is kept for the lifetime of the optimizer and is
        re-evaluated completely on every step.
    """
    # step size of the plain gradient step that is used when the
    # gradient can not be normalized by its standard deviation
    _fallback_lr = 0.1
    # pairs whose RMS gradient change is below this value are ignored
    _min_weight = 0.0001

    def __init__(self, params, **kwargs):
        """
            Arguments:
                params (iterable): parameters (or parameter groups) to optimize
                **kwargs: stored as the default options of the parameter groups
        """
        defaults = kwargs
        self.grads = defaultdict(list)
        self.values = defaultdict(list)
        super(FiniteDifferenceGradientOptimizer, self).__init__(params, defaults)

    def _secant_estimate(self, values, grads):
        """Returns the weighted mean of all secant estimates or `None`.

        Arguments:
            values (list): recorded parameter values (numpy arrays)
            grads (list): recorded gradients (numpy arrays), aligned with `values`

        `None` is returned if no pair of consecutive records had a
        usable (finite and large enough) change in gradient.
        """
        estimate = np.zeros_like(values[-1])
        normalization_term = 0.0
        pairs = zip(values, grads, values[1:], grads[1:])
        for prev_value, prev_grad, next_value, next_grad in pairs:
            delta_grad = next_grad - prev_grad
            weight = np.sqrt(np.nanmean(delta_grad ** 2))
            if not np.isfinite(weight) or weight < self._min_weight:
                # the gradient did not change (or is broken): no information
                continue
            # division by zero is expected where single entries of the
            # gradient did not change; these entries are masked below
            with np.errstate(divide='ignore', invalid='ignore'):
                x0 = prev_value + prev_grad * (prev_value - next_value) / delta_grad
            # The mask has to refer to the pair that is currently processed.
            # NOTE(review): masked entries contribute 0.0 (as before), which
            # pulls the estimate of these entries towards zero - confirm intent.
            usable = (delta_grad != 0) & np.isfinite(x0)
            x0 = np.where(usable, x0, 0.0)
            estimate += x0 * weight
            normalization_term += weight
        if normalization_term <= 0.0:
            return None
        # np.asarray keeps 0-dimensional parameters as arrays
        return np.asarray(estimate / float(normalization_term), dtype=estimate.dtype)

    def _gradient_step(self, p, grad):
        """Moves `p` along the negative gradient, normalized by its standard deviation.

        If the standard deviation is zero or not finite (eg. for a
        parameter with a single element) a fixed step size is used.
        """
        scale = float(grad.std())
        if np.isfinite(scale) and scale > 0.0:
            p.data.add_(-grad / scale)
        else:
            p.data.add_(-self._fallback_lr * grad)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            the value returned by `closure`, or `None` if no closure was given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                # copies, since the tensors are modified in place later on
                self.grads[p].append(grad.cpu().numpy().copy())
                self.values[p].append(p.data.cpu().numpy().copy())

                estimate = self._secant_estimate(self.values[p], self.grads[p])
                if estimate is None:
                    self._gradient_step(p, grad)
                else:
                    # in-place copy keeps dtype and device of the parameter
                    p.data.copy_(torch.from_numpy(estimate))
        return loss

[docs]class CautiousLBFGS(Optimizer):
    """
        Executes the LBFGS optimizer, but chooses new starting
        values if the method is unstable due to the closeness
        to the true value.
    """
    def __init__(self, params, **kwargs):
        defaults = kwargs
        self.grads = defaultdict(list)
        self.values = defaultdict(list)
        super(FiniteDifferenceGradientOptimizer, self).__init__(params, defaults)        
[docs]    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                value = p.data
                grad = p.grad.data
                self.grads[p].append(grad.numpy().copy())
                self.values[p].append(value.numpy().copy())