Merge pull request #70 from X-DataInitiative/TICK-363-solvers-docstrings
TICK-363 Improvement of the solvers docstrings
stephanegaiffas authored Oct 26, 2017
2 parents e1cdaf1 + 88fc4a9 commit 7872bdc
Showing 15 changed files with 900 additions and 337 deletions.
4 changes: 3 additions & 1 deletion doc/modules/api.rst
@@ -235,6 +235,7 @@ Batch solvers
optim.solver.AGD
optim.solver.BFGS
optim.solver.GFB
optim.solver.SCPG

Stochastic solvers
------------------
@@ -245,9 +246,10 @@ Stochastic solvers
:template: class.rst

optim.solver.SGD
optim.solver.AdaGrad
optim.solver.SVRG
optim.solver.SAGA
optim.solver.SDCA
optim.solver.AdaGrad

History
-------
4 changes: 3 additions & 1 deletion doc/modules/optim.rst
@@ -487,7 +487,7 @@ and :math:`+\infty` otherwise).
Note that depending on the problem, :math:`g` might actually be applied to only a subset of
entries of :math:`w`.
For instance, for generalized linear models, :math:`w` contains the model weights and
an intercept, which is not penalized, see :ref:`generalized linear models <optim-model-glm>`.
an intercept, which is not penalized, see :ref:`generalized linear models <linear-models>`.
Indeed, in all ``prox`` classes, an optional ``range`` parameter is available, to apply
the regularization only to a subset of entries of :math:`w`.
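
As an illustration of this ``range`` mechanism, here is a minimal sketch; the ``ProxL1`` import path and the exact ``call`` signature are assumptions about the surrounding tick API, not something stated in this diff:

```python
import numpy as np
from tick.optim.prox import ProxL1  # assumed import path

n_features = 5
# Model weights followed by an (unpenalized) intercept
w = np.array([0.5, -0.3, 0.2, -0.8, 0.1, 2.0])

# Penalize only the entries in [0, n_features): the intercept (last entry)
# is left untouched by the proximal operator
prox = ProxL1(strength=0.25, range=(0, n_features))
w_new = prox.call(w, step=1.0)  # w_new[-1] should equal w[-1]
```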

@@ -703,9 +703,11 @@ Proximal gradient descent :class:`GD <tick.optim.
Accelerated proximal gradient descent :class:`AGD <tick.optim.solver.AGD>`
Broyden, Fletcher, Goldfarb, and Shanno (quasi-Newton) :class:`BFGS <tick.optim.solver.BFGS>`
Self-Concordant Proximal Gradient Descent :class:`SCPG <tick.optim.solver.SCPG>`
Generalized Forward-Backward :class:`GFB <tick.optim.solver.GFB>`
Stochastic Gradient Descent :class:`SGD <tick.optim.solver.SGD>`
Adaptive Gradient Descent solver :class:`AdaGrad <tick.optim.solver.AdaGrad>`
Stochastic Variance Reduced Descent :class:`SVRG <tick.optim.solver.SVRG>`
Stochastic Averaged Gradient Descent :class:`SAGA <tick.optim.solver.SAGA>`
Stochastic Dual Coordinate Ascent :class:`SDCA <tick.optim.solver.SDCA>`
======================================================= ========================================

2 changes: 1 addition & 1 deletion tick/inference/hawkes_em.py
@@ -276,7 +276,7 @@ def score(self, events=None, end_times=None, baseline=None, kernel=None):
Baseline vector for which the score is measured
If `None` baseline obtained during fitting is used
kernel : `None` or `np.ndarray', shape=(n_nodes, n_nodes, kernel_size), default=None
kernel : `None` or `np.ndarray`, shape=(n_nodes, n_nodes, kernel_size), default=None
Used to force start values for kernel parameter
If `None` kernel obtained during fitting is used
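
As a rough sketch of how the ``kernel`` argument of ``score`` can be used, under the assumption that ``HawkesEM`` is built with ``kernel_support`` and ``kernel_size`` arguments and fitted on a list of per-node timestamp arrays (only the ``score`` signature above comes from this diff):

```python
import numpy as np
from tick.inference import HawkesEM  # assumed import path

# Two nodes, one realization: a list of timestamp arrays, one per node
events = [np.array([1.0, 3.5, 7.2]), np.array([2.1, 5.8])]

em = HawkesEM(kernel_support=4.0, kernel_size=10, max_iter=100)
em.fit(events)

# Score with the kernel obtained during fitting ...
print(em.score())
# ... or force start values for the kernel, shape (n_nodes, n_nodes, kernel_size)
flat_kernel = np.full((2, 2, 10), 0.1)
print(em.score(kernel=flat_kernel))
```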
133 changes: 96 additions & 37 deletions tick/optim/solver/adagrad.py
@@ -7,66 +7,125 @@


class AdaGrad(SolverFirstOrderSto):
"""
Adaptive Gradient Descent solver
Based on the works by:
Duchi, J., Hazan, E., & Singer, Y. (2011).
Adaptive Subgradient Methods for Online Learning and
Stochastic Optimization. Journal of Machine Learning Research.
"""Adaptive stochastic gradient descent solver
For the minimization of objectives of the form
.. math::
\\frac 1n \\sum_{i=1}^n f_i(w) + g(w),
where the functions :math:`f_i` have smooth gradients and :math:`g` is
prox-capable and separable, namely
.. math::
g(w) = \\sum_{j=1}^d g_j(w_j)
where :math:`g_j` are prox-capable scalar functions of a single coordinate
:math:`w_j` of the vector of weights :math:`w \\in \\mathbb R^d`. Function
:math:`f = \\frac 1n \\sum_{i=1}^n f_i` corresponds
to the ``model.loss`` method of the model (passed with ``set_model`` to the
solver) and :math:`g` corresponds to the ``prox.value`` method of the
prox (passed with the ``set_prox`` method). The given prox must be, as
explained above, separable.
One epoch of :class:`AdaGrad <tick.optim.solver.AdaGrad>` corresponds to
the following iteration applied ``epoch_size`` times:
.. math::
\\begin{align*}
&\\text{for } j=1, \\ldots, d \\; \\text{ do the following:} \\\\
& \\quad g_j \\gets ( \\nabla f_i(w) )_j \\\\
& \\quad d_j \\gets d_j + g_j^2 \\\\
& \\quad w_j \\gets w_j - \\frac{\\eta}{\\sqrt{d_j + 10^{-6}}} \\; g_j \\\\
& \\quad w_j \\gets \\mathrm{prox}_{\\eta_j g_j}(w_j)
\\end{align*}
where :math:`i` is sampled at random (strategy depends on ``rand_type``) at
each iteration, and :math:`\\eta` is a step-size that can be tuned with
``step``.
The random number generator used for sampling :math:`i` can be seeded
with ``seed``.
The iterations stop whenever tolerance ``tol`` is achieved, or after
``max_iter`` epochs (namely ``max_iter``:math:`\\times` ``epoch_size``).
The obtained solution :math:`w` is returned by the ``solve`` method, and is
also stored in the ``solution`` attribute of the solver.
Parameters
----------
step : `float` default=0.01
Step-size of the algorithm
step : `float`, default=1e-2
Step-size parameter, the most important parameter of the solver.
A try-and-improve approach should be used.
tol : `float`, default=1e-10
The tolerance of the solver (iterations stop when the stopping
criterion is below it)
epoch_size : `int`
Epoch size
max_iter : `int`, default=100
Maximum number of iterations of the solver, namely maximum number of
epochs (by default full pass over the data, unless ``epoch_size`` has
been modified from default)
rand_type : `str`
Type of random sampling
rand_type : {'unif', 'perm'}, default='unif'
How samples are randomly selected from the data
* if ``"unif"`` samples are uniformly drawn among all possibilities
* if ``"perm"`` a random permutation of all possibilities is
* if ``'unif'`` samples are uniformly drawn among all possibilities
* if ``'perm'`` a random permutation of all possibilities is
generated and samples are sequentially taken from it. Once all of
them have been taken, a new random permutation is generated
tol : `float`, default=0
The tolerance of the solver (iterations stop when the stopping
criterion is below it). By default the solver does ``max_iter``
iterations
max_iter : `int`
Maximum number of iterations of the solver
verbose : `bool`, default=True
If `True`, we verbose things, otherwise the solver does not
print anything (but records information in history anyway)
If `True`, the solver prints history information, otherwise nothing is
displayed, but history is recorded anyway
print_every : `int`, default = 10
print_every : `int`, default=10
Print history information every time the iteration number is a
multiple of ``print_every``
multiple of ``print_every``. Used only if ``verbose`` is True
record_every : `int`, default = 1
Information along iteration is recorded in history each time the
iteration number of a multiple of ``record_every``
record_every : `int`, default=1
Save history information every time the iteration number is a
multiple of ``record_every``
seed : `int`
seed : `int`, default=-1
The seed of the random sampling. If it is negative then a random seed
(different at each run) will be chosen.
epoch_size : `int`, default given by model
Epoch size, namely the number of solver iterations performed within one
epoch (a full pass over the data by default). This is automatically
tuned using information from the model object passed through
``set_model``.
Attributes
----------
model : `Solver`
The model to solve
model : `Model`
The model used by the solver, passed with the ``set_model`` method
prox : `Prox`
Proximal operator to solve
Proximal operator used by the solver, passed with the ``set_prox``
method
solution : `numpy.array`, shape=(n_coeffs,)
Minimizer found by the solver
history : `dict`-like
A dict-like object that contains history of the solver along
iterations. It should be accessed using the ``get_history`` method
time_start : `str`
Start date of the call to ``solve()``
time_elapsed : `float`
Duration of the call to ``solve()``, in seconds
time_end : `str`
End date of the call to ``solve()``
References
----------
* J. Duchi, E. Hazan, Y. Singer, Adaptive Subgradient Methods for Online
Learning and Stochastic Optimization, *Journal of Machine Learning
Research* (2011)
"""

def __init__(self, step: float = 0.01, epoch_size: int = None,
rand_type: str = "unif", tol: float = 0.,
def __init__(self, step: float = 1e-2, epoch_size: int = None,
rand_type: str = 'unif', tol: float = 1e-10,
max_iter: int = 100, verbose: bool = True,
print_every: int = 10, record_every: int = 1,
seed: int = -1):
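
To make the update rule in the docstring above concrete, here is a plain NumPy sketch of the AdaGrad iteration for a least-squares loss with a separable L1 penalty; this illustrates the formulas only and is not the library's implementation (all names are local to the snippet):

```python
import numpy as np

def adagrad(X, y, step=1e-2, l1=1e-3, n_epochs=50, eps=1e-6, seed=0):
    """NumPy sketch of the AdaGrad iteration described above, for
    f_i(w) = 0.5 * (x_i^T w - y_i)^2 and the separable prox g(w) = l1 * ||w||_1."""
    rng = np.random.default_rng(seed)
    n, d = X.shape
    w = np.zeros(d)
    hist = np.zeros(d)                       # d_j: accumulated squared gradients
    for _ in range(n_epochs * n):            # n_epochs epochs of epoch_size = n
        i = rng.integers(n)                  # 'unif' sampling of i
        grad = (X[i] @ w - y[i]) * X[i]      # (grad f_i(w))_j for every j
        hist += grad ** 2                    # d_j <- d_j + g_j^2
        eta = step / np.sqrt(hist + eps)     # per-coordinate step eta / sqrt(d_j + 1e-6)
        w = w - eta * grad
        w = np.sign(w) * np.maximum(np.abs(w) - eta * l1, 0.0)  # prox of the L1 term
    return w

X = np.random.default_rng(0).standard_normal((200, 5))
y = X @ np.array([1., -2., 0., 0.5, 0.])
print(adagrad(X, y, step=0.5))
```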
94 changes: 66 additions & 28 deletions tick/optim/solver/agd.py
@@ -8,53 +8,85 @@


class AGD(SolverFirstOrder):
"""
AGD (accelerated proximal gradient descent) algorithm.
"""Accelerated proximal gradient descent
For the minimization of objectives of the form
.. math::
f(w) + g(w),
where :math:`f` has a smooth gradient and :math:`g` is prox-capable.
Function :math:`f` corresponds to the ``model.loss`` method of the model
(passed with ``set_model`` to the solver) and :math:`g` corresponds to
the ``prox.value`` method of the prox (passed with the ``set_prox`` method).
One iteration of :class:`AGD <tick.optim.solver.AGD>` is as follows:
.. math::
w^{k} &\\gets \\mathrm{prox}_{\\eta g} \\big(z^k - \\eta \\nabla f(z^k)
\\big) \\\\
t_{k+1} &\\gets \\frac{1 + \\sqrt{1 + 4 t_k^2}}{2} \\\\
z^{k+1} &\\gets w^k + \\frac{t_k - 1}{t_{k+1}} (w^k - w^{k-1})
where :math:`\\nabla f(w)` is the gradient of :math:`f` given by the
``model.grad`` method and :math:`\\mathrm{prox}_{\\eta g}` is given by the
``prox.call`` method. The step-size :math:`\\eta` can be tuned with
``step``. The iterations stop whenever tolerance ``tol`` is achieved, or
after ``max_iter`` iterations. The obtained solution :math:`w` is returned
by the ``solve`` method, and is also stored in the ``solution`` attribute
of the solver.
Parameters
----------
step : `float` default=None
Step-size of the algorithm. If ``linesearch=True``, this is the
first step-size to be used in the linesearch
(typically taken too large). Otherwise, it's the constant step
to be used along iterations.
tol : `float`, default=0.
step : `float`, default=None
Step-size parameter, the most important parameter of the solver.
Whenever possible, this can be automatically tuned as
``step = 1 / model.get_lip_best()``. If ``linesearch=True``, this is
the first step-size to be used in the linesearch (which is typically
taken too large).
tol : `float`, default=1e-10
The tolerance of the solver (iterations stop when the stopping
criterion is below it). By default the solver does ``max_iter``
iterations
criterion is below it)
max_iter : `int`, default=100
Maximum number of iterations of the solver
Maximum number of iterations of the solver.
linesearch : `bool`, default=True
Use backtracking linesearch
linesearch_step_increase : `float`, default=2.
Factor of step increase when using linesearch
linesearch_step_decrease : `float`, default=0.5
Factor of step decrease when using linesearch
If `True`, use backtracking linesearch to tune the step automatically.
verbose : `bool`, default=True
If `True`, we verbose things, otherwise the solver does not
print anything (but records information in history anyway)
If `True`, the solver prints history information, otherwise nothing is
displayed, but history is recorded anyway
print_every : `int`, default=10
Print history information when ``n_iter`` (iteration number) is
a multiple of ``print_every``
Print history information every time the iteration number is a
multiple of ``print_every``. Used only if ``verbose`` is True
record_every : `int`, default=1
Record history information when ``n_iter`` (iteration number) is
a multiple of ``record_every``
Save history information every time the iteration number is a
multiple of ``record_every``
linesearch_step_increase : `float`, default=2.
Factor of step increase when using linesearch
linesearch_step_decrease : `float`, default=0.5
Factor of step decrease when using linesearch
Attributes
----------
model : `Model`
The model to solve
The model used by the solver, passed with the ``set_model`` method
prox : `Prox`
Proximal operator to solve
Proximal operator used by the solver, passed with the ``set_prox``
method
solution : `numpy.array`, shape=(n_coeffs,)
Minimizer found by the solver
history : `dict`-like
A dict-like object that contains history of the solver along
iterations. It should be accessed using the ``get_history`` method
time_start : `str`
Start date of the call to ``solve()``
@@ -64,9 +96,15 @@ class AGD(SolverFirstOrder):
time_end : `str`
End date of the call to ``solve()``
References
----------
* A. Beck and M. Teboulle, A fast iterative shrinkage-thresholding
algorithm for linear inverse problems,
*SIAM journal on imaging sciences*, 2009
"""

def __init__(self, step: float = None, tol: float = 0.,
def __init__(self, step: float = None, tol: float = 1e-10,
max_iter: int = 100, linesearch: bool = True,
linesearch_step_increase: float = 2.,
linesearch_step_decrease: float = 0.5,
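
A hedged NumPy sketch of the accelerated iteration described in the docstring above (the FISTA recursion), again for a least-squares ``f`` and an L1 ``g``; it mirrors the formulas only and is not the library implementation:

```python
import numpy as np

def agd(X, y, l1=0.1, max_iter=200, tol=1e-10):
    """Accelerated proximal gradient descent (FISTA) as described above, for
    f(w) = 0.5/n * ||Xw - y||^2 and g(w) = l1 * ||w||_1."""
    n, d = X.shape
    step = n / np.linalg.norm(X, 2) ** 2        # 1 / Lipschitz constant of grad f
    w, w_old, z, t = np.zeros(d), np.zeros(d), np.zeros(d), 1.0
    for _ in range(max_iter):
        grad = X.T @ (X @ z - y) / n            # gradient of f at z^k
        w = z - step * grad
        w = np.sign(w) * np.maximum(np.abs(w) - step * l1, 0.0)  # prox_{eta g}
        t_new = (1.0 + np.sqrt(1.0 + 4.0 * t * t)) / 2.0
        z = w + (t - 1.0) / t_new * (w - w_old)  # momentum step
        if np.linalg.norm(w - w_old) < tol * max(np.linalg.norm(w), 1.0):
            break
        w_old, t = w.copy(), t_new
    return w

X = np.random.default_rng(0).standard_normal((200, 10))
y = X[:, :3] @ np.array([2., -1., 0.5])
print(agd(X, y))
```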
15 changes: 8 additions & 7 deletions tick/optim/solver/base/first_order.py
@@ -166,7 +166,7 @@ def _initialize_values(self, x0: np.ndarray = None, step: float = None,
return tuple(result)

def set_prox(self, prox: Prox):
"""Set proximal operator in the solver.
"""Set proximal operator in the solver
Parameters
----------
Expand All @@ -181,7 +181,7 @@ def set_prox(self, prox: Prox):
Notes
-----
In some solvers, ``set_model`` must be called before
``set_prox``, otherwise an error might be raised.
``set_prox``, otherwise an error might be raised
"""
if not isinstance(prox, Prox):
raise ValueError('Passed object of class %s is not a '
@@ -198,12 +198,12 @@ def _as_dict(self):
return dd

def objective(self, coeffs, loss: float=None):
"""Compute the objective minimized by the solver at ``coeffs``
"""Compute the objective function
Parameters
----------
coeffs : `numpy.ndarray`, shape=(n_coeffs,)
The objective is computed at this point
coeffs : `np.array`, shape=(n_coeffs,)
Point where the objective is computed
loss : `float`, default=`None`
Gives the value of the loss if already known (allows to
@@ -234,12 +234,13 @@ def solve(self, x0=None, step=None):
Starting point of the solver
step : `float`, default=`None`
Step-size or learning rate for the solver
Step-size or learning rate for the solver. This can also be tuned
using the ``step`` attribute
Returns
-------
output : `np.array`, shape=(n_coeffs,)
Obtained minimizer for the problem
Obtained minimizer for the problem, same as ``solution`` attribute
"""
if self.model is None:
raise ValueError('You must first set the model using '
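
To tie ``set_model``, ``set_prox`` and ``solve`` together, a hedged usage sketch; ``ModelLinReg``, ``ProxL2Sq``, ``n_coeffs`` and ``get_lip_best`` are assumptions about the surrounding tick API, only the call order and the ``solve(x0, step)`` signature come from this diff:

```python
import numpy as np
from tick.optim.model import ModelLinReg   # assumed model class
from tick.optim.prox import ProxL2Sq       # assumed prox class
from tick.optim.solver import AGD

X = np.random.randn(500, 10)
y = X @ np.ones(10) + 0.1 * np.random.randn(500)

model = ModelLinReg(fit_intercept=False).fit(X, y)
prox = ProxL2Sq(strength=1e-2)

solver = AGD(tol=1e-10, max_iter=100, verbose=False)
solver.set_model(model)   # in some solvers set_model must precede set_prox
solver.set_prox(prox)

# Starting point and step-size can be passed directly to solve()
w0 = np.zeros(model.n_coeffs)
w = solver.solve(x0=w0, step=1. / model.get_lip_best())
print(solver.objective(w))
```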