diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 4345fbdf6d..2f431ccffa 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -28,24 +28,25 @@ class MeanField(Approximation): mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - model : PyMC3 model for inference - start : Point + model : :class:`Model` + PyMC3 model for inference + start : `Point` initial mean - cost_part_grad_scale : float or scalar tensor + cost_part_grad_scale : `scalar` Scaling score part of gradient can be useful near optimum for archiving better convergence properties. Common schedule is 1 at the start and 0 in the end. So slow decay will be ok. See (Sticking the Landing; Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016) for details - scale_cost_to_minibatch : bool, default False - Scale cost to minibatch instead of full dataset + scale_cost_to_minibatch : `bool` + Scale cost to minibatch instead of full dataset, default False seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one References ---------- - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 + - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf """ @@ -121,10 +122,15 @@ class FullRank(Approximation): seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one + + Other Parameters + ---------------- + gpu_compat : bool + use GPU compatible version or not References ---------- - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 + - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf """ @@ -211,17 +217,17 @@ def from_mean_field(cls, mean_field, gpu_compat=False): Parameters ---------- - mean_field : MeanField + mean_field : :class:`MeanField` approximation to start with - Flags - ----- - gpu_compat : bool + Other Parameters + ---------------- + gpu_compat : `bool` use GPU compatible version or not Returns ------- - FullRank + :class:`FullRank` """ full_rank = object.__new__(cls) # type: FullRank full_rank.gpu_compat = gpu_compat @@ -247,15 +253,16 @@ class Empirical(Approximation): Parameters ---------- - trace : MultiTrace + trace : :class:`MultiTrace` local_rv : dict[var->tuple] Experimental for Empirical Approximation mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - scale_cost_to_minibatch : bool, default False - Scale cost to minibatch instead of full dataset - model : PyMC3 model + scale_cost_to_minibatch : `bool` + Scale cost to minibatch instead of full dataset, default False + model : :class:`Model` + PyMC3 model for inference seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one @@ -356,15 +363,18 @@ def from_noise(cls, size, jitter=.01, local_rv=None, Parameters ---------- - size : number of initial particles - jitter : initial sd - local_rv : dict + size : `int` + number of initial particles + jitter : `float` + initial sd + local_rv : `dict` mapping {model_variable -> local_variable} Local Vars are used 
for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - start : initial point - model : pm.Model - PyMC3 Model + start : `Point` + initial point + model : :class:`Model` + PyMC3 model for inference seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one @@ -372,7 +382,7 @@ def from_noise(cls, size, jitter=.01, local_rv=None, Returns ------- - Empirical + :class:`Empirical` """ hist = cls(None, local_rv=local_rv, model=model, seed=seed, **kwargs) if start is None: @@ -394,15 +404,16 @@ def sample_approx(approx, draws=100, include_transformed=True): Parameters ---------- - approx : Approximation - draws : int + approx : :class:`Approximation` + Approximation to sample from + draws : `int` Number of random samples. - include_transformed : bool + include_transformed : `bool` If True, transformed variables are also sampled. Default is True. Returns ------- - trace : pymc3.backends.base.MultiTrace + trace : :class:`pymc3.backends.base.MultiTrace` Samples drawn from variational posterior. """ if not isinstance(approx, Approximation): diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index d716b84597..64518a6dae 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -271,21 +271,28 @@ class ADVI(Inference): observed variables with different :code:`total_size` and iterate them independently during inference. - For working with ADVI, we need to give + For working with ADVI, we need to give + - The probabilistic model - (:code:`model`), the three types of RVs (:code:`observed_RVs`, + + :code:`model` with three types of RVs (:code:`observed_RVs`, :code:`global_RVs` and :code:`local_RVs`). - (optional) Minibatches + The tensors to which mini-batched samples are supplied are handled separately by using callbacks in :code:`.fit` method that change storage of shared theano variable or by :code:`pm.generator` that automatically iterates over minibatches and is defined beforehand. - (optional) Parameters of deterministic mappings + They have to be passed along with other params to :code:`.fit` method as :code:`more_obj_params` argument. + + See Also + -------- For more information concerning training stage please reference :code:`pymc3.variational.opvi.ObjectiveFunction.step_function` @@ -295,35 +302,34 @@ class ADVI(Inference): mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - - model : PyMC3 model for inference - - cost_part_grad_scale : float or scalar tensor + model : :class:`Model` + PyMC3 model for inference + cost_part_grad_scale : `scalar` Scaling score part of gradient can be useful near optimum for archiving better convergence properties. Common schedule is 1 at the start and 0 in the end. So slow decay will be ok. See (Sticking the Landing; Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016) for details - scale_cost_to_minibatch : bool, default False - Scale cost to minibatch instead of full dataset + scale_cost_to_minibatch : `bool` + Scale cost to minibatch instead of full dataset, default False seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one - start : Point + start : `Point` starting point for inference References ---------- - - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., + - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and Blei, D. M.
(2016). Automatic Differentiation Variational Inference. arXiv preprint arXiv:1603.00788. - - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 + - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf - - Kingma, D. P., & Welling, M. (2014). - Auto-Encoding Variational Bayes. stat, 1050, 1. + - Kingma, D. P., & Welling, M. (2014). + Auto-Encoding Variational Bayes. stat, 1050, 1. """ def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, @@ -343,12 +349,12 @@ def from_mean_field(cls, mean_field): Parameters ---------- - mean_field : MeanField + mean_field : :class:`MeanField` approximation to start with Returns ------- - ADVI + :class:`ADVI` """ if not isinstance(mean_field, MeanField): raise TypeError('Expected MeanField, got %r' % mean_field) @@ -369,10 +375,9 @@ class FullRankADVI(Inference): mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - - model : PyMC3 model for inference - - cost_part_grad_scale : float or scalar tensor + model : :class:`Model` + PyMC3 model for inference + cost_part_grad_scale : `scalar` Scaling score part of gradient can be useful near optimum for archiving better convergence properties. Common schedule is 1 at the start and 0 in the end. So slow decay will be ok. @@ -383,21 +388,21 @@ class FullRankADVI(Inference): seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one - start : Point + start : `Point` starting point for inference References ---------- - - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., + - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and Blei, D. M. (2016). Automatic Differentiation Variational Inference. arXiv preprint arXiv:1603.00788. - - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 + - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf - - Kingma, D. P., & Welling, M. (2014). - Auto-Encoding Variational Bayes. stat, 1050, 1. + - Kingma, D. P., & Welling, M. (2014). + Auto-Encoding Variational Bayes. stat, 1050, 1. 
""" def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, @@ -417,12 +422,12 @@ def from_full_rank(cls, full_rank): Parameters ---------- - full_rank : FullRank + full_rank : :class:`FullRank` approximation to start with Returns ------- - FullRankADVI + :class:`FullRankADVI` """ if not isinstance(full_rank, FullRank): raise TypeError('Expected MeanField, got %r' % full_rank) @@ -439,17 +444,17 @@ def from_mean_field(cls, mean_field, gpu_compat=False): Parameters ---------- - mean_field : MeanField + mean_field : :class:`MeanField` approximation to start with - Flags - ----- - gpu_compat : bool + Other Parameters + ---------------- + gpu_compat : `bool` use GPU compatible version or not Returns ------- - FullRankADVI + :class:`FullRankADVI` """ full_rank = FullRank.from_mean_field(mean_field, gpu_compat) inference = object.__new__(cls) @@ -465,16 +470,16 @@ def from_advi(cls, advi, gpu_compat=False): Parameters ---------- - advi : ADVI + advi : :class:`ADVI` - Flags - ----- + Other Parameters + ---------------- gpu_compat : bool use GPU compatible version or not Returns ------- - FullRankADVI + :class:`FullRankADVI` """ inference = cls.from_mean_field(advi.approx, gpu_compat) inference.hist = advi.hist @@ -494,6 +499,7 @@ class SVGD(Inference): Input: A target distribution with density function :math:`p(x)` and a set of initial particles :math:`{x^0_i}^n_{i=1}` Output: A set of particles :math:`{x_i}^n_{i=1}` that approximates the target distribution. + .. math:: x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l) @@ -501,28 +507,29 @@ class SVGD(Inference): Parameters ---------- - n_particles : int + n_particles : `int` number of particles to use for approximation - jitter : + jitter : `float` noise sd for initial point - model : pm.Model - kernel : callable + model : :class:`Model` + PyMC3 model for inference + kernel : `callable` kernel function for KSD f(histogram) -> (k(x,.), \nabla_x k(x,.)) scale_cost_to_minibatch : bool, default False Scale cost to minibatch instead of full dataset - start : dict + start : `dict` initial point for inference - histogram : Empirical + histogram : :class:`Empirical` initialize SVGD with given Empirical approximation instead of default initial particles seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one - start : Point + start : `Point` starting point for inference References ---------- - - Qiang Liu, Dilin Wang (2016) + - Qiang Liu, Dilin Wang (2016) Stein Variational Gradient Descent: A General Purpose Bayesian Inference Algorithm arXiv:1608.04471 """ @@ -546,26 +553,31 @@ def fit(n=10000, local_rv=None, method='advi', model=None, seed=None, start=None Parameters ---------- - n : int + n : `int` number of iterations local_rv : dict[var->tuple] mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - method : str or Inference + method : str or :class:`Inference` string name is case insensitive in {'advi', 'fullrank_advi', 'advi->fullrank_advi'} - model : Model - kwargs : kwargs for Inference.fit - frac : float + model : :class:`Model` + PyMC3 model for inference + + Other Parameters + ---------------- + frac : `float` if method is 'advi->fullrank_advi' represents advi fraction when training seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one - start : Point + start : `Point` starting 
point for inference + kwargs : kwargs for :meth:`Inference.fit` + Returns ------- - Approximation + :class:`Approximation` """ if model is None: model = pm.modelcontext(model) diff --git a/pymc3/variational/operators.py b/pymc3/variational/operators.py index 38a3726fa1..a4f8de9bdd 100644 --- a/pymc3/variational/operators.py +++ b/pymc3/variational/operators.py @@ -12,6 +12,7 @@ class KL(Operator): """ Operator based on Kullback Leibler Divergence + .. math:: KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv @@ -41,8 +42,9 @@ class KSD(Operator): Operator based on Kernelized Stein Discrepancy Input: A target distribution with density function :math:`p(x)` - and a set of initial particles :math:`{x^0_i}^n_{i=1}` - Output: A set of particles :math:`{x_i}^n_{i=1}` that approximates the target distribution. + and a set of initial particles :math:`\{x^0_i\}^n_{i=1}` + Output: A set of particles :math:`\{x_i\}^n_{i=1}` that approximates the target distribution. + .. math:: x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l) @@ -50,11 +52,12 @@ class KSD(Operator): Parameters ---------- - approx : pm.Empirical + approx : :class:`pm.Empirical` + Empirical Approximation used for inference References ---------- - - Qiang Liu, Dilin Wang (2016) + - Qiang Liu, Dilin Wang (2016) Stein Variational Gradient Descent: A General Purpose Bayesian Inference Algorithm arXiv:1608.04471 """ diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py index f734c4d8d7..13dca544e5 100644 --- a/pymc3/variational/opvi.py +++ b/pymc3/variational/opvi.py @@ -5,7 +5,7 @@ reveal the true nature of underlying problem. In some applications it can yield unreliable decisions. -Recently on NIPS 2017 [OPVI](https://arxiv.org/abs/1610.09033) framework +Recently on NIPS 2017 `OPVI <https://arxiv.org/abs/1610.09033>`_ framework was presented. It generalizes variational inference so that the problem is built with blocks. The first and essential block is Model itself. Second is Approximation, in some cases :math:`log Q(D)` is not really needed.
Necessity @@ -68,8 +68,10 @@ class ObjectiveFunction(object): Parameters ---------- - op : Operator - tf : TestFunction + op : :class:`Operator` + OPVI Functional operator + tf : :class:`TestFunction` + OPVI TestFunction """ def __init__(self, op, tf): self.op = op @@ -85,7 +87,7 @@ def random(self, size=None): Parameters ---------- - size : int + size : `int` number of samples from distribution Returns @@ -101,26 +103,26 @@ def updates(self, obj_n_mc=None, tf_n_mc=None, obj_optimizer=adam, test_optimize Parameters ---------- - obj_n_mc : int + obj_n_mc : `int` Number of monte carlo samples used for approximation of objective gradients - tf_n_mc : int + tf_n_mc : `int` Number of monte carlo samples used for approximation of test function gradients obj_optimizer : function (loss, params) -> updates Optimizer that is used for objective params test_optimizer : function (loss, params) -> updates Optimizer that is used for test function params - more_obj_params : list + more_obj_params : `list` Add custom params for objective optimizer - more_tf_params : list + more_tf_params : `list` Add custom params for test function optimizer - more_updates : dict + more_updates : `dict` Add custom updates to resulting updates - more_replacements : dict + more_replacements : `dict` Apply custom replacements before calculating gradients Returns ------- - ObjectiveUpdates + :class:`ObjectiveUpdates` """ if more_obj_params is None: more_obj_params = [] @@ -182,36 +184,37 @@ def step_function(self, obj_n_mc=None, tf_n_mc=None, """Step function that should be called on each optimization step. Generally it solves the following problem: + .. math:: \textbf{\lambda^{*}} = \inf_{\lambda} \sup_{\theta} t(\mathbb{E}_{\lambda}[(O^{p,q}f_{\theta})(z)]) Parameters ---------- - obj_n_mc : int + obj_n_mc : `int` Number of monte carlo samples used for approximation of objective gradients - tf_n_mc : int + tf_n_mc : `int` Number of monte carlo samples used for approximation of test function gradients obj_optimizer : function (loss, params) -> updates Optimizer that is used for objective params test_optimizer : function (loss, params) -> updates Optimizer that is used for test function params - more_obj_params : list + more_obj_params : `list` Add custom params for objective optimizer - more_tf_params : list + more_tf_params : `list` Add custom params for test function optimizer - more_updates : dict + more_updates : `dict` Add custom updates to resulting updates - score : bool + score : `bool` calculate loss on each step? Defaults to False for speed - fn_kwargs : dict + fn_kwargs : `dict` Add kwargs to theano.function (e.g. 
`{'profile': True}`) - more_replacements : dict + more_replacements : `dict` Apply custom replacements before calculating gradients Returns ------- - theano.function + `theano.function` """ if fn_kwargs is None: fn_kwargs = {} @@ -237,11 +240,11 @@ def score_function(self, sc_n_mc=None, more_replacements=None, fn_kwargs=None): Parameters ---------- - sc_n_mc : int + sc_n_mc : `int` number of scoring MC samples more_replacements: Apply custom replacements before compiling a function - fn_kwargs: + fn_kwargs: `dict` arbitrary kwargs passed to theano.function Returns @@ -278,10 +281,11 @@ class Operator(object): Parameters ---------- - approx : Approximation + approx : :class:`Approximation` + an approximation instance - Subclassing - ----------- + Notes + ----- For implementing Custom operator it is needed to define :code:`.apply(f)` method """ @@ -326,19 +330,21 @@ def logq_norm(self, z): def apply(self, f): # pragma: no cover """Operator itself + .. math:: (O^{p,q}f_{\theta})(z) Parameters ---------- - f : TestFunction or None if not required + f : :class:`TestFunction` or None if not required function that takes `z = self.input` and returns same dimensional output Returns ------- - symbolically applied operator + tt.TensorVariable + symbolically applied operator """ raise NotImplementedError @@ -426,7 +432,8 @@ def _setup(self, dim): Parameters ---------- - dim : int dimension of posterior distribution + dim : int + dimension of posterior distribution """ pass @@ -445,12 +452,11 @@ class Approximation(object): Parameters ---------- local_rv : dict[var->tuple] - mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)} + mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - - model : PyMC3 model for inference - + model : :class:`Model` + PyMC3 model for inference cost_part_grad_scale : float or scalar tensor Scaling score part of gradient can be useful near optimum for archiving better convergence properties. Common schedule is @@ -463,10 +469,11 @@ class Approximation(object): leave None to use package global RandomStream or other valid value to create instance specific one - Subclassing - ----------- + Notes + ----- Defining an approximation needs custom implementation of the following methods: + - :code:`.create_shared_params(**kwargs)` Returns {dict|list|theano.shared} @@ -481,19 +488,21 @@ class Approximation(object): Returns Scalar You can also override the following methods: + - :code:`._setup(**kwargs)` Do some specific stuff having :code:`kwargs` before calling :code:`.create_shared_params` - :code:`.check_model(model, **kwargs)` Do some specific check for model having :code:`kwargs` - Notes - ----- + See Also + -------- :code:`kwargs` mentioned above are supplied as additional arguments for :code:`Approximation.__init__` There are some defaults class attributes for approximation classes that can be optionally overriden. + - :code:`initial_dist_name` string that represents name of the initial distribution. 
In most cases if will be `uniform` or `normal` @@ -553,7 +562,7 @@ def seed(self, seed=None): Parameters ---------- - seed : int + seed : `int` """ self._seed = seed self._rng.seed(seed) @@ -609,16 +618,16 @@ def construct_replacements(self, include=None, exclude=None, Parameters ---------- - include : list + include : `list` latent variables to be replaced - exclude : list + exclude : `list` latent variables to be excluded for replacements - more_replacements : dict + more_replacements : `dict` add custom replacements to graph, e.g. change input source Returns ------- - dict + `dict` Replacements """ if include is not None and exclude is not None: @@ -647,11 +656,11 @@ def apply_replacements(self, node, deterministic=False, deterministic : bool whether to use zeros as initial distribution if True - zero initial point will produce constant latent variables - include : list + include : `list` latent variables to be replaced - exclude : list + exclude : `list` latent variables to be excluded for replacements - more_replacements : dict + more_replacements : `dict` add custom replacements to graph, e.g. change input source Returns @@ -674,7 +683,7 @@ def sample_node(self, node, size=100, node : Theano Variables (or Theano expressions) size : scalar number of samples - more_replacements : dict + more_replacements : `dict` add custom replacements to graph, e.g. change input source Returns @@ -716,13 +725,16 @@ def initial(self, size, no_rand=False, l=None): Parameters ---------- - size : int - number of samples - no_rand : bool - return zeros if True - l : length of sample, defaults to latent space dim + size : `int` + number of samples + no_rand : `bool` + return zeros if True + l : `int` + length of sample, defaults to latent space dim Returns ------- - Tensor + `tt.TensorVariable` sampled latent space shape == size + latent_dim """ @@ -754,8 +766,10 @@ def random_local(self, size=None, no_rand=False): Parameters ---------- - size : number of samples from distribution - no_rand : whether use deterministic distribution + size : `scalar` + number of samples from distribution + no_rand : `bool` + whether use deterministic distribution Returns ------- @@ -771,8 +785,10 @@ def random_global(self, size=None, no_rand=False): # pragma: no cover Parameters ---------- - size : number of samples from distribution - no_rand : whether use deterministic distribution + size : `scalar` + number of samples from distribution + no_rand : `bool` + whether use deterministic distribution Returns ------- @@ -785,8 +801,10 @@ def random(self, size=None, no_rand=False): Parameters ---------- - size : number of samples from distribution - no_rand : whether use deterministic distribution + size : `scalar` + number of samples from distribution + no_rand : `bool` + whether use deterministic distribution Returns ------- @@ -816,8 +834,10 @@ def random_fn(self): Parameters ---------- - size : number of samples from distribution - no_rand : whether use deterministic distribution + size : `int` + number of samples from distribution + no_rand : `bool` + whether use deterministic distribution Returns ------- @@ -844,14 +864,14 @@ def sample(self, draws=1, include_transformed=False): Parameters ---------- - draws : int + draws : `int` Number of random samples. - include_transformed : bool + include_transformed : `bool` If True, transformed variables are also sampled. Default is False. 
Returns ------- - trace : pymc3.backends.base.MultiTrace + trace : :class:`pymc3.backends.base.MultiTrace` Samples drawn from variational posterior. """ vars_sampled = get_default_varnames(self.model.unobserved_RVs, @@ -910,15 +930,17 @@ def view(self, space, name, reshape=True): Parameters ---------- - space : space to take view of variable from - name : str + space : matrix or vector + space to take view of variable from + name : `str` name of variable - reshape : bool + reshape : `bool` whether to reshape variable from vectorized view Returns ------- - variable view + (reshaped) slice of matrix + variable view """ theano_is_here = isinstance(space, tt.TensorVariable) slc = self._view[name].slc
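The docstrings touched above describe the public variational API (:code:`pm.fit`, :code:`Approximation.sample`, :code:`sample_approx`). As an illustrative sketch of how that API is typically exercised, and not part of the patch itself, the following assumes a PyMC3 version contemporaneous with this diff and a throwaway normal model; argument names follow the signatures shown in the docstrings above:

import numpy as np
import pymc3 as pm

# Toy data for a minimal model; purely illustrative.
data = np.random.randn(100)

with pm.Model():
    mu = pm.Normal('mu', mu=0., sd=10.)
    sd = pm.HalfNormal('sd', sd=1.)
    pm.Normal('obs', mu=mu, sd=sd, observed=data)

    # method='advi' selects the MeanField/ADVI pair documented in inference.py;
    # n is the number of fitting iterations (see the fit() docstring above).
    approx = pm.fit(n=10000, method='advi')

# draws and include_transformed follow the Approximation.sample docstring;
# the result is a MultiTrace drawn from the variational posterior.
trace = approx.sample(draws=500, include_transformed=False)
print(trace['mu'].mean(), trace['sd'].mean())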