Commit bc69427

fix docs (#2181)
* fix docs
* more cross references
1 parent 1c0f113 commit bc69427

File tree

5 files changed: +91 -84 lines changed


pymc3/variational/approximations.py

Lines changed: 8 additions & 7 deletions
@@ -28,7 +28,7 @@ class MeanField(Approximation):
 mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
 Local Vars are used for Autoencoding Variational Bayes
 See (AEVB; Kingma and Welling, 2014) for details
-model : :class:`Model`
+model : :class:`pymc3.Model`
 PyMC3 model for inference
 start : `Point`
 initial mean
@@ -40,7 +40,7 @@ class MeanField(Approximation):
 Yuhuai Wu, David Duvenaud, 2016) for details
 scale_cost_to_minibatch : `bool`
 Scale cost to minibatch instead of full dataset, default False
-random seed : None or int
+random_seed : None or int
 leave None to use package global RandomStream or other
 valid value to create instance specific one

@@ -258,21 +258,22 @@ class Empirical(Approximation):
 Parameters
 ----------
 trace : :class:`MultiTrace`
+Trace storing samples (e.g. from step methods)
 local_rv : dict[var->tuple]
 Experimental for Empirical Approximation
 mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
 Local Vars are used for Autoencoding Variational Bayes
 See (AEVB; Kingma and Welling, 2014) for details
 scale_cost_to_minibatch : `bool`
 Scale cost to minibatch instead of full dataset, default False
-model : :class:`Model`
+model : :class:`pymc3.Model`
 PyMC3 model for inference
 random_seed : None or int
 leave None to use package global RandomStream or other
 valid value to create instance specific one

-Usage
------
+Examples
+--------
 >>> with model:
 ... step = NUTS()
 ... trace = sample(1000, step=step)
@@ -377,9 +378,9 @@ def from_noise(cls, size, jitter=.01, local_rv=None,
 See (AEVB; Kingma and Welling, 2014) for details
 start : `Point`
 initial point
-model : :class:`Model`
+model : :class:`pymc3.Model`
 PyMC3 model for inference
-random_seed : None or int
+random_seed : None or `int`
 leave None to use package global RandomStream or other
 valid value to create instance specific one
 kwargs : other kwargs passed to init
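
Example (not part of the diff): a minimal sketch of the pattern the new Examples section documents, wrapping a NUTS trace in an Empirical approximation. The toy model, sample counts, and the assumption that the approximation's `sample` method returns a trace are illustrative, not taken from the commit.

import numpy as np
import pymc3 as pm
from pymc3.variational.approximations import Empirical

# Illustrative toy model: a single Gaussian mean with observed data
data = np.random.randn(100)
with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sd=1)
    pm.Normal('obs', mu=mu, sd=1, observed=data)

    # Sample with NUTS, then wrap the trace in an Empirical approximation,
    # following the Examples section of the Empirical docstring above
    step = pm.NUTS()
    trace = pm.sample(1000, step=step)
    approx = Empirical(trace, model=model)

# Draw fresh samples from the empirical approximation
# (assumed to return a MultiTrace in this version of the API)
posterior = approx.sample(500)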

pymc3/variational/inference.py

Lines changed: 51 additions & 49 deletions
@@ -25,7 +25,7 @@


 class Inference(object):
-"""
+R"""
 Base class for Variational Inference

 Communicates Operator, Approximation and Test Function to build Objective Function
@@ -41,8 +41,9 @@ class Inference(object):
 See (AEVB; Kingma and Welling, 2014) for details
 model : Model
 PyMC3 Model
-kwargs : kwargs for Approximation
+kwargs : kwargs for :class:`Approximation`
 """
+
 def __init__(self, op, approx, tf, local_rv=None, model=None, **kwargs):
 self.hist = np.asarray(())
 if isinstance(approx, type) and issubclass(approx, Approximation):
@@ -99,11 +100,11 @@ def fit(self, n=10000, score=None, callbacks=None, progressbar=True,
 number of iterations
 score : bool
 evaluate loss on each iteration or not
-callbacks : list[function : (Approximation, losses, i) -> any]
+callbacks : list[function : (Approximation, losses, i) -> None]
 calls provided functions after each iteration step
 progressbar : bool
 whether to show progressbar or not
-kwargs : kwargs for ObjectiveFunction.step_function
+kwargs : kwargs for :func:`ObjectiveFunction.step_function`

 Returns
 -------
@@ -177,7 +178,7 @@ def _iterate_with_loss(self, n, step_func, progress, callbacks):


 class ADVI(Inference):
-"""
+R"""
 Automatic Differentiation Variational Inference (ADVI)

 This class implements the meanfield ADVI, where the variational
@@ -195,7 +196,7 @@ class ADVI(Inference):
 in the model.

 The next ones are global random variables
-:math:`\Theta=\{\\theta^{k}\}_{k=1}^{V_{g}}`, which are used to calculate
+:math:`\Theta=\{\theta^{k}\}_{k=1}^{V_{g}}`, which are used to calculate
 the probabilities for all observed samples.

 The last ones are local random variables
@@ -212,35 +213,35 @@ class ADVI(Inference):
 These parameters are denoted as :math:`\gamma`. While :math:`\gamma` is
 a constant, the parameters of :math:`q(\mathbf{z}_{i})` are dependent on
 each observation. Therefore these parameters are denoted as
-:math:`\\xi(\mathbf{y}_{i}; \\nu)`, where :math:`\\nu` is the parameters
-of :math:`\\xi(\cdot)`. For example, :math:`\\xi(\cdot)` can be a
+:math:`\xi(\mathbf{y}_{i}; \nu)`, where :math:`\nu` is the parameters
+of :math:`\xi(\cdot)`. For example, :math:`\xi(\cdot)` can be a
 multilayer perceptron or convolutional neural network.

-In addition to :math:`\\xi(\cdot)`, we can also include deterministic
+In addition to :math:`\xi(\cdot)`, we can also include deterministic
 mappings for the likelihood of observations. We denote the parameters of
 the deterministic mappings as :math:`\eta`. An example of such mappings is
 the deconvolutional neural network used in the convolutional VAE example
 in the PyMC3 notebook directory.

 This function maximizes the evidence lower bound (ELBO)
-:math:`{\cal L}(\gamma, \\nu, \eta)` defined as follows:
+:math:`{\cal L}(\gamma, \nu, \eta)` defined as follows:

 .. math::

-{\cal L}(\gamma,\\nu,\eta) & =
+{\cal L}(\gamma,\nu,\eta) & =
 \mathbf{c}_{o}\mathbb{E}_{q(\Theta)}\left[
 \sum_{i=1}^{N}\mathbb{E}_{q(\mathbf{z}_{i})}\left[
 \log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta)
-\\right]\\right] \\\\ &
-- \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right]
+\right]\right] \\ &
+- \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\right]
 - \mathbf{c}_{l}\sum_{i=1}^{N}
-KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\\right],
+KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\right],

 where :math:`KL[q(v)||p(v)]` is the Kullback-Leibler divergence

 .. math::

-KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv,
+KL[q(v)||p(v)] = \int q(v)\log\frac{q(v)}{p(v)}dv,

 :math:`\mathbf{c}_{o/g/l}` are vectors for weighting each term of ELBO.
 More precisely, we can write each of the terms in ELBO as follows:
@@ -250,59 +251,56 @@ class ADVI(Inference):
 \mathbf{c}_{o}\log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) & = &
 \sum_{k=1}^{V_{o}}c_{o}^{k}
 \log p(\mathbf{y}_{i}^{k}|
-{\\rm pa}(\mathbf{y}_{i}^{k},\Theta,\eta)) \\\\
-\mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right] & = &
+{\rm pa}(\mathbf{y}_{i}^{k},\Theta,\eta)) \\
+\mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\right] & = &
 \sum_{k=1}^{V_{g}}c_{g}^{k}KL\left[
-q(\\theta^{k})||p(\\theta^{k}|{\\rm pa(\\theta^{k})})\\right] \\\\
-\mathbf{c}_{l}KL\left[q(\mathbf{z}_{i}||p(\mathbf{z}_{i})\\right] & = &
+q(\theta^{k})||p(\theta^{k}|{\rm pa(\theta^{k})})\right] \\
+\mathbf{c}_{l}KL\left[q(\mathbf{z}_{i}||p(\mathbf{z}_{i})\right] & = &
 \sum_{k=1}^{V_{l}}c_{l}^{k}KL\left[
 q(\mathbf{z}_{i}^{k})||
-p(\mathbf{z}_{i}^{k}|{\\rm pa}(\mathbf{z}_{i}^{k}))\\right],
+p(\mathbf{z}_{i}^{k}|{\rm pa}(\mathbf{z}_{i}^{k}))\right],

-where :math:`{\\rm pa}(v)` denotes the set of parent variables of :math:`v`
+where :math:`{\rm pa}(v)` denotes the set of parent variables of :math:`v`
 in the directed acyclic graph of the model.

 When using mini-batches, :math:`c_{o}^{k}` and :math:`c_{l}^{k}` should be
 set to :math:`N/M`, where :math:`M` is the number of observations in each
-mini-batch. This is done with supplying :code:`total_size` parameter to
+mini-batch. This is done with supplying `total_size` parameter to
 observed nodes (e.g. :code:`Normal('x', 0, 1, observed=data, total_size=10000)`).
 In this case it is possible to automatically determine appropriate scaling for :math:`logp`
 of observed nodes. Interesting to note that it is possible to have two independent
-observed variables with different :code:`total_size` and iterate them independently
+observed variables with different `total_size` and iterate them independently
 during inference.

 For working with ADVI, we need to give

 - The probabilistic model

-:code:`model` with three types of RVs (:code:`observed_RVs`,
-:code:`global_RVs` and :code:`local_RVs`).
+`model` with three types of RVs (`observed_RVs`,
+`global_RVs` and `local_RVs`).

 - (optional) Minibatches

 The tensors to which mini-bathced samples are supplied are
-handled separately by using callbacks in :code:`.fit` method
-that change storage of shared theano variable or by :code:`pm.generator`
+handled separately by using callbacks in :func:`Inference.fit` method
+that change storage of shared theano variable or by :func:`pymc3.generator`
 that automatically iterates over minibatches and defined beforehand.

 - (optional) Parameters of deterministic mappings

-They have to be passed along with other params to :code:`.fit` method
-as :code:`more_obj_params` argument.
-
-
-See Also
---------
+They have to be passed along with other params to :func:`Inference.fit` method
+as `more_obj_params` argument.
+
 For more information concerning training stage please reference
-:code:`pymc3.variational.opvi.ObjectiveFunction.step_function`
+:func:`pymc3.variational.opvi.ObjectiveFunction.step_function`

 Parameters
 ----------
 local_rv : dict[var->tuple]
-mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
+mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)}
 Local Vars are used for Autoencoding Variational Bayes
 See (AEVB; Kingma and Welling, 2014) for details
-model : :class:`Model`
+model : :class:`pymc3.Model`
 PyMC3 model for inference
 cost_part_grad_scale : `scalar`
 Scaling score part of gradient can be useful near optimum for
@@ -331,6 +329,7 @@ class ADVI(Inference):
 - Kingma, D. P., & Welling, M. (2014).
 Auto-Encoding Variational Bayes. stat, 1050, 1.
 """
+
 def __init__(self, local_rv=None, model=None,
 cost_part_grad_scale=1,
 scale_cost_to_minibatch=False,
@@ -366,7 +365,7 @@ def from_mean_field(cls, mean_field):


 class FullRankADVI(Inference):
-"""
+R"""
 Full Rank Automatic Differentiation Variational Inference (ADVI)

 Parameters
@@ -375,7 +374,7 @@ class FullRankADVI(Inference):
 mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
 Local Vars are used for Autoencoding Variational Bayes
 See (AEVB; Kingma and Welling, 2014) for details
-model : :class:`Model`
+model : :class:`pymc3.Model`
 PyMC3 model for inference
 cost_part_grad_scale : `scalar`
 Scaling score part of gradient can be useful near optimum for
@@ -404,6 +403,7 @@ class FullRankADVI(Inference):
 - Kingma, D. P., & Welling, M. (2014).
 Auto-Encoding Variational Bayes. stat, 1050, 1.
 """
+
 def __init__(self, local_rv=None, model=None,
 cost_part_grad_scale=1,
 scale_cost_to_minibatch=False,
@@ -487,22 +487,23 @@ def from_advi(cls, advi, gpu_compat=False):


 class SVGD(Inference):
-"""
+R"""
 Stein Variational Gradient Descent

 This inference is based on Kernelized Stein Discrepancy
 it's main idea is to move initial noisy particles so that
 they fit target distribution best.

 Algorithm is outlined below
+
+*Input:* A target distribution with density function :math:`p(x)`
+and a set of initial particles :math:`{x^0_i}^n_{i=1}`

-Input: A target distribution with density function :math:`p(x)`
-and a set of initial particles :math:`{x^0_i}^n_{i=1}`
-Output: A set of particles :math:`{x_i}^n_{i=1}` that approximates the target distribution.
+*Output:* A set of particles :math:`{x_i}^n_{i=1}` that approximates the target distribution.

 .. math::

-x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l)
+x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l) \\
 \hat{\phi}^{*}(x) = \frac{1}{n}\sum^{n}_{j=1}[k(x^l_j,x) \nabla_{x^l_j} logp(x^l_j)+ \nabla_{x^l_j} k(x^l_j,x)]

 Parameters
@@ -511,10 +512,10 @@ class SVGD(Inference):
 number of particles to use for approximation
 jitter : `float`
 noise sd for initial point
-model : :class:`Model`
+model : :class:`pymc3.Model`
 PyMC3 model for inference
 kernel : `callable`
-kernel function for KSD f(histogram) -> (k(x,.), \nabla_x k(x,.))
+kernel function for KSD :math:`f(histogram) -> (k(x,.), \nabla_x k(x,.))`
 scale_cost_to_minibatch : bool, default False
 Scale cost to minibatch instead of full dataset
 start : `dict`
@@ -533,6 +534,7 @@ class SVGD(Inference):
 Stein Variational Gradient Descent: A General Purpose Bayesian Inference Algorithm
 arXiv:1608.04471
 """
+
 def __init__(self, n_particles=100, jitter=.01, model=None, kernel=test_functions.rbf,
 scale_cost_to_minibatch=False, start=None, histogram=None,
 random_seed=None, local_rv=None):
@@ -548,20 +550,20 @@ def __init__(self, n_particles=100, jitter=.01, model=None, kernel=test_function


 def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, start=None, **kwargs):
-"""
+R"""
 Handy shortcut for using inference methods in functional way

 Parameters
 ----------
 n : `int`
 number of iterations
 local_rv : dict[var->tuple]
-mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
+mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)}
 Local Vars are used for Autoencoding Variational Bayes
 See (AEVB; Kingma and Welling, 2014) for details
 method : str or :class:`Inference`
 string name is case insensitive in {'advi', 'fullrank_advi', 'advi->fullrank_advi', 'svgd'}
-model : :class:`Model`
+model : :class:`pymc3.Model`
 PyMC3 model for inference
 random_seed : None or int
 leave None to use package global RandomStream or other
@@ -573,7 +575,7 @@ def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, sta
 ----------------
 frac : `float`
 if method is 'advi->fullrank_advi' represents advi fraction when training
-kwargs : kwargs for :method:`Inference.fit`
+kwargs : kwargs for :func:`Inference.fit`

 Returns
 -------
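
Example (not part of the diff): a sketch combining the `total_size` scaling from the ADVI docstring with the `fit` shortcut documented above. It assumes `pm.generator` accepts a plain Python generator of minibatches and that `pm.fit` returns the fitted approximation, as the cross-references added in this commit suggest; the toy model and iteration counts are illustrative.

import numpy as np
import pymc3 as pm

# Full dataset of 10000 observations; ADVI sees minibatches of 100
full_data = np.random.randn(10000)

def minibatches(batch_size=100):
    # Endless stream of random minibatches, consumed via pm.generator
    while True:
        idx = np.random.randint(0, len(full_data), batch_size)
        yield full_data[idx]

batch = pm.generator(minibatches())

with pm.Model():
    mu = pm.Normal('mu', mu=0, sd=1)
    # total_size rescales the minibatch log-likelihood to the full dataset,
    # i.e. the N/M weighting of the ELBO terms described in the docstring
    pm.Normal('obs', mu=mu, sd=1, observed=batch, total_size=10000)

    # Functional shortcut: mean-field ADVI for 10000 iterations
    # (assumed to return the fitted approximation in this version)
    approx = pm.fit(n=10000, method='advi')

trace = approx.sample(500)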

pymc3/variational/operators.py

Lines changed: 4 additions & 3 deletions
@@ -38,21 +38,22 @@ def __call__(self, z):


 class KSD(Operator):
-"""
+R"""
 Operator based on Kernelized Stein Discrepancy

 Input: A target distribution with density function :math:`p(x)`
 and a set of initial particles :math:`\{x^0_i\}^n_{i=1}`
+
 Output: A set of particles :math:`\{x_i\}^n_{i=1}` that approximates the target distribution.

 .. math::

-x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l)
+x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l) \\
 \hat{\phi}^{*}(x) = \frac{1}{n}\sum^{n}_{j=1}[k(x^l_j,x) \nabla_{x^l_j} logp(x^l_j)+ \nabla_{x^l_j} k(x^l_j,x)]

 Parameters
 ----------
-approx : :class:`pm.Empirical`
+approx : :class:`Empirical`
 Empirical Approximation used for inference

 References
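
Example (not part of the diff): KSD is the operator behind the SVGD class touched earlier in this commit, which applies it to an Empirical approximation of particles. A hedged usage sketch with an illustrative toy model:

import numpy as np
import pymc3 as pm

data = np.random.randn(200)

with pm.Model():
    mu = pm.Normal('mu', mu=0, sd=1)
    pm.Normal('obs', mu=mu, sd=1, observed=data)

    # SVGD moves n_particles jittered particles toward the posterior;
    # under the hood it applies the KSD operator to an Empirical approximation
    svgd = pm.SVGD(n_particles=100, jitter=0.01)
    # fit is assumed to return the particle approximation in this API
    approx = svgd.fit(n=5000)

trace = approx.sample(500)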
