@@ -271,21 +271,28 @@ class ADVI(Inference):
observed variables with different :code:`total_size` and iterate them independently
during inference.
- For working with ADVI, we need to give
+ For working with ADVI, we need to give
+
- The probabilistic model
- (:code:`model`), the three types of RVs (:code:`observed_RVs`,
+
+ :code:`model` with three types of RVs (:code:`observed_RVs`,
:code:`global_RVs` and :code:`local_RVs`).
- (optional) Minibatches
+
The tensors to which mini-batched samples are supplied are
handled separately, either by using callbacks in the :code:`.fit` method
that change the storage of a shared theano variable, or by :code:`pm.generator`,
which automatically iterates over minibatches defined beforehand.
- (optional) Parameters of deterministic mappings
+
They have to be passed along with other params to the :code:`.fit` method
as the :code:`more_obj_params` argument.
+
+ See Also
+ --------
For more information concerning the training stage, please refer to
:code:`pymc3.variational.opvi.ObjectiveFunction.step_function`
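
To make the minibatch mechanics described above concrete, here is a minimal sketch (illustrative only, not part of this changeset; the toy data, generator helper, and batch size are made up) of feeding minibatches to ADVI via :code:`pm.generator` and :code:`total_size`::

    import numpy as np
    import pymc3 as pm

    data = np.random.randn(1000)  # toy dataset, for illustration only

    def minibatches(batch_size=100):
        # endlessly yield random batches of the data
        while True:
            idx = np.random.randint(0, len(data), batch_size)
            yield data[idx]

    x_mb = pm.generator(minibatches())

    with pm.Model() as model:
        mu = pm.Normal('mu', mu=0., sd=1.)
        # total_size rescales the minibatch likelihood to the full dataset
        pm.Normal('x', mu=mu, sd=1., observed=x_mb, total_size=len(data))
        approx = pm.fit(n=10000, method='advi')
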
@@ -295,35 +302,34 @@ class ADVI(Inference):
mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
Local Vars are used for Autoencoding Variational Bayes
See (AEVB; Kingma and Welling, 2014) for details
-
- model : PyMC3 model for inference
-
- cost_part_grad_scale : float or scalar tensor
+ model : :class:`Model`
+ PyMC3 model for inference
+ cost_part_grad_scale : `scalar`
Scaling the score part of the gradient can be useful near the optimum for
achieving better convergence properties. A common schedule is
1 at the start and 0 at the end, so a slow decay will be ok.
See (Sticking the Landing; Geoffrey Roeder,
Yuhuai Wu, David Duvenaud, 2016) for details
- scale_cost_to_minibatch : bool, default False
- Scale cost to minibatch instead of full dataset
+ scale_cost_to_minibatch : `bool`
+ Scale cost to minibatch instead of full dataset, default False
seed : None or int
leave None to use the package-global RandomStream, or any other
valid value to create an instance-specific one
- start : Point
+ start : `Point`
starting point for inference
References
----------
- - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
+ - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
and Blei, D. M. (2016). Automatic Differentiation Variational
Inference. arXiv preprint arXiv:1603.00788.
- - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016
+ - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016
Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI
approximateinference.org/accepted/RoederEtAl2016.pdf
- - Kingma, D. P., & Welling, M. (2014).
- Auto-Encoding Variational Bayes. stat, 1050, 1.
+ - Kingma, D. P., & Welling, M. (2014).
+ Auto-Encoding Variational Bayes. stat, 1050, 1.
"""
328
334
def __init__ (self , local_rv = None , model = None ,
329
335
cost_part_grad_scale = 1 ,
@@ -343,12 +349,12 @@ def from_mean_field(cls, mean_field):
Parameters
----------
- mean_field : MeanField
+ mean_field : :class:`MeanField`
approximation to start with
Returns
-------
- ADVI
+ :class:`ADVI`
"""
353
359
if not isinstance (mean_field , MeanField ):
354
360
raise TypeError ('Expected MeanField, got %r' % mean_field )
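
For illustration (a sketch, not part of this changeset, assuming :class:`MeanField` can be instantiated with default arguments inside a model context), seeding ADVI from an existing mean-field approximation might look like::

    with model:
        mf = pm.MeanField()                      # pre-built approximation
        inference = pm.ADVI.from_mean_field(mf)  # warm-start ADVI from it
        inference.fit(n=5000)
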
@@ -369,10 +375,9 @@ class FullRankADVI(Inference):
mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
Local Vars are used for Autoencoding Variational Bayes
See (AEVB; Kingma and Welling, 2014) for details
-
- model : PyMC3 model for inference
-
- cost_part_grad_scale : float or scalar tensor
+ model : :class:`Model`
+ PyMC3 model for inference
+ cost_part_grad_scale : `scalar`
Scaling the score part of the gradient can be useful near the optimum for
achieving better convergence properties. A common schedule is
1 at the start and 0 at the end, so a slow decay will be ok.
@@ -383,21 +388,21 @@ class FullRankADVI(Inference):
seed : None or int
leave None to use the package-global RandomStream, or any other
valid value to create an instance-specific one
- start : Point
+ start : `Point`
starting point for inference
References
----------
- - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
+ - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
and Blei, D. M. (2016). Automatic Differentiation Variational
Inference. arXiv preprint arXiv:1603.00788.
- - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016
+ - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016
Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI
approximateinference.org/accepted/RoederEtAl2016.pdf
- - Kingma, D. P., & Welling, M. (2014).
- Auto-Encoding Variational Bayes. stat, 1050, 1.
+ - Kingma, D. P., & Welling, M. (2014).
+ Auto-Encoding Variational Bayes. stat, 1050, 1.
"""
402
407
def __init__ (self , local_rv = None , model = None ,
403
408
cost_part_grad_scale = 1 ,
@@ -417,12 +422,12 @@ def from_full_rank(cls, full_rank):
Parameters
----------
- full_rank : FullRank
+ full_rank : :class:`FullRank`
approximation to start with
Returns
-------
- FullRankADVI
+ :class:`FullRankADVI`
"""
427
432
if not isinstance (full_rank , FullRank ):
428
433
raise TypeError ('Expected MeanField, got %r' % full_rank )
@@ -439,17 +444,17 @@ def from_mean_field(cls, mean_field, gpu_compat=False):
Parameters
----------
- mean_field : MeanField
+ mean_field : :class:`MeanField`
approximation to start with
- Flags
- -----
- gpu_compat : bool
+ Other Parameters
+ ----------------
+ gpu_compat : `bool`
whether to use the GPU-compatible version
Returns
-------
- FullRankADVI
+ :class:`FullRankADVI`
"""
454
459
full_rank = FullRank .from_mean_field (mean_field , gpu_compat )
455
460
inference = object .__new__ (cls )
@@ -465,16 +470,16 @@ def from_advi(cls, advi, gpu_compat=False):
Parameters
----------
- advi : ADVI
+ advi : :class:`ADVI`
- Flags
- -----
+ Other Parameters
+ ----------------
gpu_compat : bool
whether to use the GPU-compatible version
Returns
-------
- FullRankADVI
+ :class:`FullRankADVI`
"""
479
484
inference = cls .from_mean_field (advi .approx , gpu_compat )
480
485
inference .hist = advi .hist
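
For context, a sketch (not part of this changeset) of the intended upgrade path from a fitted ADVI run to a full-rank approximation::

    with model:
        advi = pm.ADVI()
        advi.fit(n=10000)
        # reuse the fitted mean-field result to seed the full-rank approximation
        inference = pm.FullRankADVI.from_advi(advi)
        inference.fit(n=10000)
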
@@ -494,35 +499,37 @@ class SVGD(Inference):
Input: A target distribution with density function :math:`p(x)`
and a set of initial particles :math:`{x^0_i}^n_{i=1}`
Output: A set of particles :math:`{x_i}^n_{i=1}` that approximates the target distribution.
+
.. math::

    x_i^{l+1} \leftarrow x_i^l + \epsilon_l \hat{\phi}^{*}(x_i^l)

    \hat{\phi}^{*}(x) = \frac{1}{n}\sum^{n}_{j=1}[k(x^l_j, x) \nabla_{x^l_j} \log p(x^l_j) + \nabla_{x^l_j} k(x^l_j, x)]
Parameters
----------
- n_particles : int
+ n_particles : `int`
number of particles to use for approximation
- jitter :
+ jitter : `float`
noise sd for initial point
- model : pm.Model
- kernel : callable
+ model : :class:`Model`
+ PyMC3 model for inference
+ kernel : `callable`
kernel function for KSD f(histogram) -> (k(x,.), \nabla_x k(x,.))
scale_cost_to_minibatch : bool, default False
Scale cost to minibatch instead of full dataset
- start : dict
+ start : `dict`
initial point for inference
- histogram : Empirical
+ histogram : :class:`Empirical`
initialize SVGD with given Empirical approximation instead of default initial particles
seed : None or int
leave None to use the package-global RandomStream, or any other
valid value to create an instance-specific one
- start : Point
+ start : `Point`
starting point for inference
References
----------
- - Qiang Liu, Dilin Wang (2016)
+ - Qiang Liu, Dilin Wang (2016)
Stein Variational Gradient Descent: A General Purpose Bayesian Inference Algorithm
arXiv:1608.04471
"""
@@ -546,26 +553,31 @@ def fit(n=10000, local_rv=None, method='advi', model=None, seed=None, start=None
Parameters
----------
- n : int
+ n : `int`
number of iterations
local_rv : dict[var->tuple]
mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
Local Vars are used for Autoencoding Variational Bayes
See (AEVB; Kingma and Welling, 2014) for details
- method : str or Inference
+ method : str or :class:`Inference`
string name is case insensitive in {'advi', 'fullrank_advi', 'advi->fullrank_advi'}
- model : Model
- kwargs : kwargs for Inference.fit
- frac : float
+ model : :class:`Model`
+ PyMC3 model for inference
+
+ Other Parameters
+ ----------------
+ frac : `float`
if method is 'advi->fullrank_advi', the fraction of training spent in the ADVI stage
seed : None or int
leave None to use the package-global RandomStream, or any other
valid value to create an instance-specific one
- start : Point
+ start : `Point`
starting point for inference
+ kwargs : kwargs for :meth:`Inference.fit`
+
Returns
-------
- Approximation
+ :class:`Approximation`
"""
570
582
if model is None :
571
583
model = pm .modelcontext (model )
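
As a usage illustration of :code:`fit` (a sketch with a made-up toy model, assuming the returned :class:`Approximation` exposes a :code:`sample` method)::

    import numpy as np
    import pymc3 as pm

    with pm.Model() as model:
        mu = pm.Normal('mu', mu=0., sd=1.)
        pm.Normal('obs', mu=mu, sd=1., observed=np.random.randn(100))
        # mean-field ADVI for the first half of training, full-rank for the rest
        approx = pm.fit(n=20000, method='advi->fullrank_advi', frac=0.5)

    trace = approx.sample(500)
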