diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 4345fbdf6d..2f431ccffa 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -28,24 +28,25 @@ class MeanField(Approximation): mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - model : PyMC3 model for inference - start : Point + model : :class:`Model` + PyMC3 model for inference + start : `Point` initial mean - cost_part_grad_scale : float or scalar tensor + cost_part_grad_scale : `scalar` Scaling score part of gradient can be useful near optimum for archiving better convergence properties. Common schedule is 1 at the start and 0 in the end. So slow decay will be ok. See (Sticking the Landing; Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016) for details - scale_cost_to_minibatch : bool, default False - Scale cost to minibatch instead of full dataset + scale_cost_to_minibatch : `bool` + Scale cost to minibatch instead of full dataset, default False seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one References ---------- - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 + - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf """ @@ -121,10 +122,15 @@ class FullRank(Approximation): seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one + + Other Parameters + ---------------- + gpu_compat : bool + use GPU compatible version or not References ---------- - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 + - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf """ @@ -211,17 +217,17 @@ def from_mean_field(cls, mean_field, gpu_compat=False): Parameters ---------- - mean_field : MeanField + mean_field : :class:`MeanField` approximation to start with - Flags - ----- - gpu_compat : bool + Other Parameters + ---------------- + gpu_compat : `bool` use GPU compatible version or not Returns ------- - FullRank + :class:`FullRank` """ full_rank = object.__new__(cls) # type: FullRank full_rank.gpu_compat = gpu_compat @@ -247,15 +253,16 @@ class Empirical(Approximation): Parameters ---------- - trace : MultiTrace + trace : :class:`MultiTrace` local_rv : dict[var->tuple] Experimental for Empirical Approximation mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - scale_cost_to_minibatch : bool, default False - Scale cost to minibatch instead of full dataset - model : PyMC3 model + scale_cost_to_minibatch : `bool` + Scale cost to minibatch instead of full dataset, default False + model : :class:`Model` + PyMC3 model for inference seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one @@ -356,15 +363,18 @@ def from_noise(cls, size, jitter=.01, local_rv=None, Parameters ---------- - size : number of initial particles - jitter : initial sd - local_rv : dict + size : `int` + number of initial particles + jitter : `float` + initial sd + local_rv : `dict` mapping {model_variable -> local_variable} Local Vars are used 
for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - start : initial point - model : pm.Model - PyMC3 Model + start : `Point` + initial point + model : :class:`Model` + PyMC3 model for inference seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one @@ -372,7 +382,7 @@ def from_noise(cls, size, jitter=.01, local_rv=None, Returns ------- - Empirical + :class:`Empirical` """ hist = cls(None, local_rv=local_rv, model=model, seed=seed, **kwargs) if start is None: @@ -394,15 +404,16 @@ def sample_approx(approx, draws=100, include_transformed=True): Parameters ---------- - approx : Approximation - draws : int + approx : :class:`Approximation` + Approximation to sample from + draws : `int` Number of random samples. - include_transformed : bool + include_transformed : `bool` If True, transformed variables are also sampled. Default is True. Returns ------- - trace : pymc3.backends.base.MultiTrace + trace : :class:`pymc3.backends.base.MultiTrace` Samples drawn from variational posterior. """ if not isinstance(approx, Approximation): diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index d716b84597..64518a6dae 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -271,21 +271,28 @@ class ADVI(Inference): observed variables with different :code:`total_size` and iterate them independently during inference. - For working with ADVI, we need to give + For working with ADVI, we need to give + - The probabilistic model - (:code:`model`), the three types of RVs (:code:`observed_RVs`, + + :code:`model` with three types of RVs (:code:`observed_RVs`, :code:`global_RVs` and :code:`local_RVs`). - (optional) Minibatches + The tensors to which mini-batched samples are supplied are handled separately by using callbacks in :code:`.fit` method that change storage of shared theano variable or by :code:`pm.generator` that automatically iterates over minibatches and is defined beforehand. - (optional) Parameters of deterministic mappings + They have to be passed along with other params to :code:`.fit` method as :code:`more_obj_params` argument. + + See Also + -------- For more information concerning training stage please reference :code:`pymc3.variational.opvi.ObjectiveFunction.step_function` @@ -295,35 +302,34 @@ class ADVI(Inference): mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - - model : PyMC3 model for inference - - cost_part_grad_scale : float or scalar tensor + model : :class:`Model` + PyMC3 model for inference + cost_part_grad_scale : `scalar` Scaling score part of gradient can be useful near optimum for archiving better convergence properties. Common schedule is 1 at the start and 0 in the end. So slow decay will be ok. See (Sticking the Landing; Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016) for details - scale_cost_to_minibatch : bool, default False - Scale cost to minibatch instead of full dataset + scale_cost_to_minibatch : `bool` + Scale cost to minibatch instead of full dataset, default False seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one - start : Point + start : `Point` starting point for inference References ---------- - - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., + - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and Blei, D. M.
(2016). Automatic Differentiation Variational Inference. arXiv preprint arXiv:1603.00788. - - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 + - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf - - Kingma, D. P., & Welling, M. (2014). - Auto-Encoding Variational Bayes. stat, 1050, 1. + - Kingma, D. P., & Welling, M. (2014). + Auto-Encoding Variational Bayes. stat, 1050, 1. """ def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, @@ -343,12 +349,12 @@ def from_mean_field(cls, mean_field): Parameters ---------- - mean_field : MeanField + mean_field : :class:`MeanField` approximation to start with Returns ------- - ADVI + :class:`ADVI` """ if not isinstance(mean_field, MeanField): raise TypeError('Expected MeanField, got %r' % mean_field) @@ -369,10 +375,9 @@ class FullRankADVI(Inference): mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - - model : PyMC3 model for inference - - cost_part_grad_scale : float or scalar tensor + model : :class:`Model` + PyMC3 model for inference + cost_part_grad_scale : `scalar` Scaling score part of gradient can be useful near optimum for archiving better convergence properties. Common schedule is 1 at the start and 0 in the end. So slow decay will be ok. @@ -383,21 +388,21 @@ class FullRankADVI(Inference): seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one - start : Point + start : `Point` starting point for inference References ---------- - - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., + - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and Blei, D. M. (2016). Automatic Differentiation Variational Inference. arXiv preprint arXiv:1603.00788. - - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 + - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf - - Kingma, D. P., & Welling, M. (2014). - Auto-Encoding Variational Bayes. stat, 1050, 1. + - Kingma, D. P., & Welling, M. (2014). + Auto-Encoding Variational Bayes. stat, 1050, 1. 
""" def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, @@ -417,12 +422,12 @@ def from_full_rank(cls, full_rank): Parameters ---------- - full_rank : FullRank + full_rank : :class:`FullRank` approximation to start with Returns ------- - FullRankADVI + :class:`FullRankADVI` """ if not isinstance(full_rank, FullRank): raise TypeError('Expected MeanField, got %r' % full_rank) @@ -439,17 +444,17 @@ def from_mean_field(cls, mean_field, gpu_compat=False): Parameters ---------- - mean_field : MeanField + mean_field : :class:`MeanField` approximation to start with - Flags - ----- - gpu_compat : bool + Other Parameters + ---------------- + gpu_compat : `bool` use GPU compatible version or not Returns ------- - FullRankADVI + :class:`FullRankADVI` """ full_rank = FullRank.from_mean_field(mean_field, gpu_compat) inference = object.__new__(cls) @@ -465,16 +470,16 @@ def from_advi(cls, advi, gpu_compat=False): Parameters ---------- - advi : ADVI + advi : :class:`ADVI` - Flags - ----- + Other Parameters + ---------------- gpu_compat : bool use GPU compatible version or not Returns ------- - FullRankADVI + :class:`FullRankADVI` """ inference = cls.from_mean_field(advi.approx, gpu_compat) inference.hist = advi.hist @@ -494,6 +499,7 @@ class SVGD(Inference): Input: A target distribution with density function :math:`p(x)` and a set of initial particles :math:`{x^0_i}^n_{i=1}` Output: A set of particles :math:`{x_i}^n_{i=1}` that approximates the target distribution. + .. math:: x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l) @@ -501,28 +507,29 @@ class SVGD(Inference): Parameters ---------- - n_particles : int + n_particles : `int` number of particles to use for approximation - jitter : + jitter : `float` noise sd for initial point - model : pm.Model - kernel : callable + model : :class:`Model` + PyMC3 model for inference + kernel : `callable` kernel function for KSD f(histogram) -> (k(x,.), \nabla_x k(x,.)) scale_cost_to_minibatch : bool, default False Scale cost to minibatch instead of full dataset - start : dict + start : `dict` initial point for inference - histogram : Empirical + histogram : :class:`Empirical` initialize SVGD with given Empirical approximation instead of default initial particles seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one - start : Point + start : `Point` starting point for inference References ---------- - - Qiang Liu, Dilin Wang (2016) + - Qiang Liu, Dilin Wang (2016) Stein Variational Gradient Descent: A General Purpose Bayesian Inference Algorithm arXiv:1608.04471 """ @@ -546,26 +553,31 @@ def fit(n=10000, local_rv=None, method='advi', model=None, seed=None, start=None Parameters ---------- - n : int + n : `int` number of iterations local_rv : dict[var->tuple] mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - method : str or Inference + method : str or :class:`Inference` string name is case insensitive in {'advi', 'fullrank_advi', 'advi->fullrank_advi'} - model : Model - kwargs : kwargs for Inference.fit - frac : float + model : :class:`Model` + PyMC3 model for inference + + Other Parameters + ---------------- + frac : `float` if method is 'advi->fullrank_advi' represents advi fraction when training seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one - start : Point + start : `Point` starting 
point for inference + kwargs : kwargs for :meth:`Inference.fit` + Returns ------- - Approximation + :class:`Approximation` """ if model is None: model = pm.modelcontext(model) diff --git a/pymc3/variational/operators.py b/pymc3/variational/operators.py index 38a3726fa1..a4f8de9bdd 100644 --- a/pymc3/variational/operators.py +++ b/pymc3/variational/operators.py @@ -12,6 +12,7 @@ class KL(Operator): """ Operator based on Kullback Leibler Divergence + .. math:: KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv @@ -41,8 +42,9 @@ class KSD(Operator): Operator based on Kernelized Stein Discrepancy Input: A target distribution with density function :math:`p(x)` - and a set of initial particles :math:`{x^0_i}^n_{i=1}` - Output: A set of particles :math:`{x_i}^n_{i=1}` that approximates the target distribution. + and a set of initial particles :math:`\{x^0_i\}^n_{i=1}` + Output: A set of particles :math:`\{x_i\}^n_{i=1}` that approximates the target distribution. + .. math:: x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l) @@ -50,11 +52,12 @@ class KSD(Operator): Parameters ---------- - approx : pm.Empirical + approx : :class:`pm.Empirical` + Empirical Approximation used for inference References ---------- - - Qiang Liu, Dilin Wang (2016) + - Qiang Liu, Dilin Wang (2016) Stein Variational Gradient Descent: A General Purpose Bayesian Inference Algorithm arXiv:1608.04471 """ diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py index f734c4d8d7..13dca544e5 100644 --- a/pymc3/variational/opvi.py +++ b/pymc3/variational/opvi.py @@ -5,7 +5,7 @@ reveal the true nature of underlying problem. In some applications it can yield unreliable decisions. -Recently on NIPS 2017 [OPVI](https://arxiv.org/abs/1610.09033) framework +Recently on NIPS 2017 `OPVI <https://arxiv.org/abs/1610.09033>`_ framework was presented. It generalizes variational inference so that the problem is built with blocks. The first and essential block is Model itself. Second is Approximation, in some cases :math:`log Q(D)` is not really needed.
Necessity @@ -68,8 +68,10 @@ class ObjectiveFunction(object): Parameters ---------- - op : Operator - tf : TestFunction + op : :class:`Operator` + OPVI Functional operator + tf : :class:`TestFunction` + OPVI TestFunction """ def __init__(self, op, tf): self.op = op @@ -85,7 +87,7 @@ def random(self, size=None): Parameters ---------- - size : int + size : `int` number of samples from distribution Returns @@ -101,26 +103,26 @@ def updates(self, obj_n_mc=None, tf_n_mc=None, obj_optimizer=adam, test_optimize Parameters ---------- - obj_n_mc : int + obj_n_mc : `int` Number of monte carlo samples used for approximation of objective gradients - tf_n_mc : int + tf_n_mc : `int` Number of monte carlo samples used for approximation of test function gradients obj_optimizer : function (loss, params) -> updates Optimizer that is used for objective params test_optimizer : function (loss, params) -> updates Optimizer that is used for test function params - more_obj_params : list + more_obj_params : `list` Add custom params for objective optimizer - more_tf_params : list + more_tf_params : `list` Add custom params for test function optimizer - more_updates : dict + more_updates : `dict` Add custom updates to resulting updates - more_replacements : dict + more_replacements : `dict` Apply custom replacements before calculating gradients Returns ------- - ObjectiveUpdates + :class:`ObjectiveUpdates` """ if more_obj_params is None: more_obj_params = [] @@ -182,36 +184,37 @@ def step_function(self, obj_n_mc=None, tf_n_mc=None, """Step function that should be called on each optimization step. Generally it solves the following problem: + .. math:: \textbf{\lambda^{*}} = \inf_{\lambda} \sup_{\theta} t(\mathbb{E}_{\lambda}[(O^{p,q}f_{\theta})(z)]) Parameters ---------- - obj_n_mc : int + obj_n_mc : `int` Number of monte carlo samples used for approximation of objective gradients - tf_n_mc : int + tf_n_mc : `int` Number of monte carlo samples used for approximation of test function gradients obj_optimizer : function (loss, params) -> updates Optimizer that is used for objective params test_optimizer : function (loss, params) -> updates Optimizer that is used for test function params - more_obj_params : list + more_obj_params : `list` Add custom params for objective optimizer - more_tf_params : list + more_tf_params : `list` Add custom params for test function optimizer - more_updates : dict + more_updates : `dict` Add custom updates to resulting updates - score : bool + score : `bool` calculate loss on each step? Defaults to False for speed - fn_kwargs : dict + fn_kwargs : `dict` Add kwargs to theano.function (e.g. 
`{'profile': True}`) - more_replacements : dict + more_replacements : `dict` Apply custom replacements before calculating gradients Returns ------- - theano.function + `theano.function` """ if fn_kwargs is None: fn_kwargs = {} @@ -237,11 +240,11 @@ def score_function(self, sc_n_mc=None, more_replacements=None, fn_kwargs=None): Parameters ---------- - sc_n_mc : int + sc_n_mc : `int` number of scoring MC samples more_replacements: Apply custom replacements before compiling a function - fn_kwargs: + fn_kwargs: `dict` arbitrary kwargs passed to theano.function Returns @@ -278,10 +281,11 @@ class Operator(object): Parameters ---------- - approx : Approximation + approx : :class:`Approximation` + an approximation instance - Subclassing - ----------- + Notes + ----- For implementing Custom operator it is needed to define :code:`.apply(f)` method """ @@ -326,19 +330,21 @@ def logq_norm(self, z): def apply(self, f): # pragma: no cover """Operator itself + .. math:: (O^{p,q}f_{\theta})(z) Parameters ---------- - f : TestFunction or None if not required + f : :class:`TestFunction` or None if not required function that takes `z = self.input` and returns same dimensional output Returns ------- - symbolically applied operator + tt.TensorVariable + symbolically applied operator """ raise NotImplementedError @@ -426,7 +432,8 @@ def _setup(self, dim): Parameters ---------- - dim : int dimension of posterior distribution + dim : int + dimension of posterior distribution """ pass @@ -445,12 +452,11 @@ class Approximation(object): Parameters ---------- local_rv : dict[var->tuple] - mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)} + mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - - model : PyMC3 model for inference - + model : :class:`Model` + PyMC3 model for inference cost_part_grad_scale : float or scalar tensor Scaling score part of gradient can be useful near optimum for archiving better convergence properties. Common schedule is @@ -463,10 +469,11 @@ class Approximation(object): leave None to use package global RandomStream or other valid value to create instance specific one - Subclassing - ----------- + Notes + ----- Defining an approximation needs custom implementation of the following methods: + - :code:`.create_shared_params(**kwargs)` Returns {dict|list|theano.shared} @@ -481,19 +488,21 @@ class Approximation(object): Returns Scalar You can also override the following methods: + - :code:`._setup(**kwargs)` Do some specific stuff having :code:`kwargs` before calling :code:`.create_shared_params` - :code:`.check_model(model, **kwargs)` Do some specific check for model having :code:`kwargs` - Notes - ----- + See Also + -------- :code:`kwargs` mentioned above are supplied as additional arguments for :code:`Approximation.__init__` There are some defaults class attributes for approximation classes that can be optionally overriden. + - :code:`initial_dist_name` string that represents name of the initial distribution. 
In most cases if will be `uniform` or `normal` @@ -553,7 +562,7 @@ def seed(self, seed=None): Parameters ---------- - seed : int + seed : `int` """ self._seed = seed self._rng.seed(seed) @@ -609,16 +618,16 @@ def construct_replacements(self, include=None, exclude=None, Parameters ---------- - include : list + include : `list` latent variables to be replaced - exclude : list + exclude : `list` latent variables to be excluded for replacements - more_replacements : dict + more_replacements : `dict` add custom replacements to graph, e.g. change input source Returns ------- - dict + `dict` Replacements """ if include is not None and exclude is not None: @@ -647,11 +656,11 @@ def apply_replacements(self, node, deterministic=False, deterministic : bool whether to use zeros as initial distribution if True - zero initial point will produce constant latent variables - include : list + include : `list` latent variables to be replaced - exclude : list + exclude : `list` latent variables to be excluded for replacements - more_replacements : dict + more_replacements : `dict` add custom replacements to graph, e.g. change input source Returns @@ -674,7 +683,7 @@ def sample_node(self, node, size=100, node : Theano Variables (or Theano expressions) size : scalar number of samples - more_replacements : dict + more_replacements : `dict` add custom replacements to graph, e.g. change input source Returns @@ -716,13 +725,16 @@ def initial(self, size, no_rand=False, l=None): Parameters ---------- - size : int - number of samples - no_rand : bool - return zeros if True - l : length of sample, defaults to latent space dim + size : `int` + number of samples + no_rand : `bool` + return zeros if True + l : `int` + length of sample, defaults to latent space dim Returns ------- - Tensor + `tt.TensorVariable` sampled latent space shape == size + latent_dim """ @@ -754,8 +766,10 @@ def random_local(self, size=None, no_rand=False): Parameters ---------- - size : number of samples from distribution - no_rand : whether use deterministic distribution + size : `scalar` + number of samples from distribution + no_rand : `bool` + whether use deterministic distribution Returns ------- @@ -771,8 +785,10 @@ def random_global(self, size=None, no_rand=False): # pragma: no cover Parameters ---------- - size : number of samples from distribution - no_rand : whether use deterministic distribution + size : `scalar` + number of samples from distribution + no_rand : `bool` + whether use deterministic distribution Returns ------- @@ -785,8 +801,10 @@ def random(self, size=None, no_rand=False): Parameters ---------- - size : number of samples from distribution - no_rand : whether use deterministic distribution + size : `scalar` + number of samples from distribution + no_rand : `bool` + whether use deterministic distribution Returns ------- @@ -816,8 +834,10 @@ def random_fn(self): Parameters ---------- - size : number of samples from distribution - no_rand : whether use deterministic distribution + size : `int` + number of samples from distribution + no_rand : `bool` + whether use deterministic distribution Returns ------- @@ -844,14 +864,14 @@ def sample(self, draws=1, include_transformed=False): Parameters ---------- - draws : int + draws : `int` Number of random samples. - include_transformed : bool + include_transformed : `bool` If True, transformed variables are also sampled. Default is False. 
Returns ------- - trace : pymc3.backends.base.MultiTrace + trace : :class:`pymc3.backends.base.MultiTrace` Samples drawn from variational posterior. """ vars_sampled = get_default_varnames(self.model.unobserved_RVs, @@ -910,15 +930,17 @@ def view(self, space, name, reshape=True): Parameters ---------- - space : space to take view of variable from - name : str + space : matrix or vector + space to take view of variable from + name : `str` name of variable - reshape : bool + reshape : `bool` whether to reshape variable from vectorized view Returns ------- - variable view + (reshaped) slice of matrix + variable view """ theano_is_here = isinstance(space, tt.TensorVariable) slc = self._view[name].slc
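The docstrings touched above describe the public variational API (:code:`pm.fit`, :code:`Approximation.sample`, :code:`sample_approx`). As an illustrative sketch of how that API is typically exercised, and not part of the patch itself, the following assumes a PyMC3 version contemporaneous with this diff and a throwaway normal model; argument names follow the signatures shown in the docstrings above:

import numpy as np
import pymc3 as pm

# Toy data for a minimal model; purely illustrative.
data = np.random.randn(100)

with pm.Model():
    mu = pm.Normal('mu', mu=0., sd=10.)
    sd = pm.HalfNormal('sd', sd=1.)
    pm.Normal('obs', mu=mu, sd=sd, observed=data)

    # method='advi' selects the MeanField/ADVI pair documented in inference.py;
    # n is the number of fitting iterations (see the fit() docstring above).
    approx = pm.fit(n=10000, method='advi')

# draws and include_transformed follow the Approximation.sample docstring;
# the result is a MultiTrace drawn from the variational posterior.
trace = approx.sample(draws=500, include_transformed=False)
print(trace['mu'].mean(), trace['sd'].mean())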