 class Inference(object):
-    """
+    R"""
     Base class for Variational Inference

     Communicates Operator, Approximation and Test Function to build Objective Function
@@ -41,8 +41,9 @@ class Inference(object):
         See (AEVB; Kingma and Welling, 2014) for details
     model : Model
         PyMC3 Model
-    kwargs : kwargs for Approximation
+    kwargs : kwargs for :class:`Approximation`
     """
+
     def __init__(self, op, approx, tf, local_rv=None, model=None, **kwargs):
         self.hist = np.asarray(())
         if isinstance(approx, type) and issubclass(approx, Approximation):
@@ -99,11 +100,11 @@ def fit(self, n=10000, score=None, callbacks=None, progressbar=True,
             number of iterations
         score : bool
             evaluate loss on each iteration or not
-        callbacks : list[function : (Approximation, losses, i) -> any]
+        callbacks : list[function : (Approximation, losses, i) -> None]
             calls provided functions after each iteration step
         progressbar : bool
             whether to show progressbar or not
-        kwargs : kwargs for ObjectiveFunction.step_function
+        kwargs : kwargs for :func:`ObjectiveFunction.step_function`

         Returns
         -------
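To make the callback signature documented above concrete, here is a minimal sketch of a logging callback; the function name and the print interval are illustrative assumptions, not part of this changeset:

import numpy as np

def print_loss_every_500(approx, losses, i):
    # matches the (Approximation, losses, i) signature documented above
    if i % 500 == 0 and len(losses):
        print('iteration %d, ELBO loss %.3f' % (i, np.asarray(losses)[-1]))

# usage sketch, assuming `inference` is e.g. an ADVI instance:
# inference.fit(n=10000, callbacks=[print_loss_every_500])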
@@ -177,7 +178,7 @@ def _iterate_with_loss(self, n, step_func, progress, callbacks):


 class ADVI(Inference):
-    """
+    R"""
     Automatic Differentiation Variational Inference (ADVI)

     This class implements the meanfield ADVI, where the variational
@@ -195,7 +196,7 @@ class ADVI(Inference):
     in the model.

     The next ones are global random variables
-    :math:`\Theta=\{\\theta^{k}\}_{k=1}^{V_{g}}`, which are used to calculate
+    :math:`\Theta=\{\theta^{k}\}_{k=1}^{V_{g}}`, which are used to calculate
     the probabilities for all observed samples.

     The last ones are local random variables
@@ -212,35 +213,35 @@ class ADVI(Inference):
     These parameters are denoted as :math:`\gamma`. While :math:`\gamma` is
     a constant, the parameters of :math:`q(\mathbf{z}_{i})` are dependent on
     each observation. Therefore these parameters are denoted as
-    :math:`\\xi(\mathbf{y}_{i}; \\nu)`, where :math:`\\nu` is the parameters
-    of :math:`\\xi(\cdot)`. For example, :math:`\\xi(\cdot)` can be a
+    :math:`\xi(\mathbf{y}_{i}; \nu)`, where :math:`\nu` is the parameters
+    of :math:`\xi(\cdot)`. For example, :math:`\xi(\cdot)` can be a
     multilayer perceptron or convolutional neural network.

-    In addition to :math:`\\xi(\cdot)`, we can also include deterministic
+    In addition to :math:`\xi(\cdot)`, we can also include deterministic
     mappings for the likelihood of observations. We denote the parameters of
     the deterministic mappings as :math:`\eta`. An example of such mappings is
     the deconvolutional neural network used in the convolutional VAE example
     in the PyMC3 notebook directory.

     This function maximizes the evidence lower bound (ELBO)
-    :math:`{\cal L}(\gamma, \\nu, \eta)` defined as follows:
+    :math:`{\cal L}(\gamma, \nu, \eta)` defined as follows:

     .. math::

-        {\cal L}(\gamma,\\nu,\eta) & =
+        {\cal L}(\gamma,\nu,\eta) & =
         \mathbf{c}_{o}\mathbb{E}_{q(\Theta)}\left[
         \sum_{i=1}^{N}\mathbb{E}_{q(\mathbf{z}_{i})}\left[
         \log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta)
-        \\right]\\right] \\\\ &
-        - \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right]
+        \right]\right] \\ &
+        - \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\right]
         - \mathbf{c}_{l}\sum_{i=1}^{N}
-        KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\\right],
+        KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\right],

     where :math:`KL[q(v)||p(v)]` is the Kullback-Leibler divergence

     .. math::

-        KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv,
+        KL[q(v)||p(v)] = \int q(v)\log\frac{q(v)}{p(v)}dv,

     :math:`\mathbf{c}_{o/g/l}` are vectors for weighting each term of ELBO.
     More precisely, we can write each of the terms in ELBO as follows:
@@ -250,59 +251,56 @@ class ADVI(Inference):
         \mathbf{c}_{o}\log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) & = &
         \sum_{k=1}^{V_{o}}c_{o}^{k}
         \log p(\mathbf{y}_{i}^{k}|
-        {\\rm pa}(\mathbf{y}_{i}^{k},\Theta,\eta)) \\\\
-        \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right] & = &
+        {\rm pa}(\mathbf{y}_{i}^{k},\Theta,\eta)) \\
+        \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\right] & = &
         \sum_{k=1}^{V_{g}}c_{g}^{k}KL\left[
-        q(\\theta^{k})||p(\\theta^{k}|{\\rm pa(\\theta^{k})})\\right] \\\\
-        \mathbf{c}_{l}KL\left[q(\mathbf{z}_{i}||p(\mathbf{z}_{i})\\right] & = &
+        q(\theta^{k})||p(\theta^{k}|{\rm pa(\theta^{k})})\right] \\
+        \mathbf{c}_{l}KL\left[q(\mathbf{z}_{i}||p(\mathbf{z}_{i})\right] & = &
         \sum_{k=1}^{V_{l}}c_{l}^{k}KL\left[
         q(\mathbf{z}_{i}^{k})||
-        p(\mathbf{z}_{i}^{k}|{\\rm pa}(\mathbf{z}_{i}^{k}))\\right],
+        p(\mathbf{z}_{i}^{k}|{\rm pa}(\mathbf{z}_{i}^{k}))\right],

-    where :math:`{\\rm pa}(v)` denotes the set of parent variables of :math:`v`
+    where :math:`{\rm pa}(v)` denotes the set of parent variables of :math:`v`
     in the directed acyclic graph of the model.

     When using mini-batches, :math:`c_{o}^{k}` and :math:`c_{l}^{k}` should be
     set to :math:`N/M`, where :math:`M` is the number of observations in each
-    mini-batch. This is done with supplying :code:`total_size` parameter to
+    mini-batch. This is done by supplying the `total_size` parameter to
     observed nodes (e.g. :code:`Normal('x', 0, 1, observed=data, total_size=10000)`).
     In this case it is possible to automatically determine appropriate scaling for :math:`logp`
     of observed nodes. It is interesting to note that it is possible to have two independent
-    observed variables with different :code:`total_size` and iterate them independently
+    observed variables with different `total_size` and iterate them independently
     during inference.
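As a minimal sketch of the `total_size` scaling described above (the toy dataset, the minibatch size, and the use of the `pm.Minibatch` helper are illustrative assumptions, not part of this changeset):

import numpy as np
import pymc3 as pm

data = np.random.randn(10000)
batch = pm.Minibatch(data, batch_size=128)

with pm.Model():
    mu = pm.Normal('mu', mu=0., sd=1.)
    # total_size rescales the minibatch logp to the size of the full dataset
    pm.Normal('x', mu=mu, sd=1., observed=batch, total_size=10000)
    approx = pm.fit(n=10000, method='advi')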

     For working with ADVI, we need to give

     -   The probabilistic model

-        :code:`model` with three types of RVs (:code:`observed_RVs`,
-        :code:`global_RVs` and :code:`local_RVs`).
+        `model` with three types of RVs (`observed_RVs`,
+        `global_RVs` and `local_RVs`).

     -   (optional) Minibatches

         The tensors to which mini-batched samples are supplied are
-        handled separately by using callbacks in :code:`.fit` method
-        that change storage of shared theano variable or by :code:`pm.generator`
+        handled separately by using callbacks in :func:`Inference.fit` method
+        that change storage of shared theano variable or by :func:`pymc3.generator`
         that automatically iterates over minibatches and defined beforehand.

     -   (optional) Parameters of deterministic mappings

-        They have to be passed along with other params to :code:`.fit` method
-        as :code:`more_obj_params` argument.
-
-
-    See Also
-    --------
+        They have to be passed along with other params to :func:`Inference.fit` method
+        as `more_obj_params` argument.
+
     For more information concerning the training stage please refer to
-    :code:`pymc3.variational.opvi.ObjectiveFunction.step_function`
+    :func:`pymc3.variational.opvi.ObjectiveFunction.step_function`

     Parameters
     ----------
     local_rv : dict[var->tuple]
-        mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
+        mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)}
         Local Vars are used for Autoencoding Variational Bayes
         See (AEVB; Kingma and Welling, 2014) for details
-    model : :class:`Model`
+    model : :class:`pymc3.Model`
         PyMC3 model for inference
     cost_part_grad_scale : `scalar`
         Scaling score part of gradient can be useful near optimum for
@@ -331,6 +329,7 @@ class ADVI(Inference):
     -   Kingma, D. P., & Welling, M. (2014).
         Auto-Encoding Variational Bayes. stat, 1050, 1.
     """
+
     def __init__(self, local_rv=None, model=None,
                  cost_part_grad_scale=1,
                  scale_cost_to_minibatch=False,
@@ -366,7 +365,7 @@ def from_mean_field(cls, mean_field):


 class FullRankADVI(Inference):
-    """
+    R"""
     Full Rank Automatic Differentiation Variational Inference (ADVI)

     Parameters
@@ -375,7 +374,7 @@ class FullRankADVI(Inference):
         mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
         Local Vars are used for Autoencoding Variational Bayes
         See (AEVB; Kingma and Welling, 2014) for details
-    model : :class:`Model`
+    model : :class:`pymc3.Model`
         PyMC3 model for inference
     cost_part_grad_scale : `scalar`
         Scaling score part of gradient can be useful near optimum for
@@ -404,6 +403,7 @@ class FullRankADVI(Inference):
     -   Kingma, D. P., & Welling, M. (2014).
         Auto-Encoding Variational Bayes. stat, 1050, 1.
     """
+
     def __init__(self, local_rv=None, model=None,
                  cost_part_grad_scale=1,
                  scale_cost_to_minibatch=False,
@@ -487,22 +487,23 @@ def from_advi(cls, advi, gpu_compat=False):


 class SVGD(Inference):
-    """
+    R"""
     Stein Variational Gradient Descent

     This inference is based on Kernelized Stein Discrepancy;
     its main idea is to move initial noisy particles so that
     they fit target distribution best.

     Algorithm is outlined below
+
+    *Input:* A target distribution with density function :math:`p(x)`
+    and a set of initial particles :math:`{x^0_i}^n_{i=1}`
-    Input: A target distribution with density function :math:`p(x)`
-    and a set of initial particles :math:`{x^0_i}^n_{i=1}`
-    Output: A set of particles :math:`{x_i}^n_{i=1}` that approximates the target distribution.
+    *Output:* A set of particles :math:`{x_i}^n_{i=1}` that approximates the target distribution.

     .. math::

-        x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l)
+        x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l) \\
         \hat{\phi}^{*}(x) = \frac{1}{n}\sum^{n}_{j=1}[k(x^l_j,x) \nabla_{x^l_j} logp(x^l_j)+ \nabla_{x^l_j} k(x^l_j,x)]
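For intuition, a minimal NumPy sketch of the update above with an RBF kernel; the bandwidth heuristic, step size, and the `grad_logp` callable are illustrative assumptions, and the perturbation is added to the current particle, following Liu & Wang (2016):

import numpy as np

def rbf_kernel(x, h):
    # k(x_j, x_i) = exp(-||x_j - x_i||^2 / h) and its gradient w.r.t. x_j
    diff = x[:, None, :] - x[None, :, :]       # (n, n, d): diff[j, i] = x_j - x_i
    sq_dist = np.sum(diff ** 2, axis=-1)       # (n, n)
    k = np.exp(-sq_dist / h)
    grad_k = -2.0 / h * diff * k[:, :, None]   # d k(x_j, x_i) / d x_j
    return k, grad_k

def svgd_step(x, grad_logp, stepsize=1e-2):
    # one particle update x_i <- x_i + eps * phi*(x_i), with phi* as in the formula above
    n = x.shape[0]
    h = np.median(np.sum((x[:, None, :] - x[None, :, :]) ** 2, axis=-1)) / np.log(n + 1)
    k, grad_k = rbf_kernel(x, h)
    phi = (k[:, :, None] * grad_logp(x)[:, None, :] + grad_k).mean(axis=0)
    return x + stepsize * phi

# usage sketch for a standard-normal target: grad_logp = lambda x: -x
# particles = np.random.randn(100, 1)
# for _ in range(1000): particles = svgd_step(particles, lambda x: -x)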

     Parameters
@@ -511,10 +512,10 @@ class SVGD(Inference):
         number of particles to use for approximation
     jitter : `float`
         noise sd for initial point
-    model : :class:`Model`
+    model : :class:`pymc3.Model`
         PyMC3 model for inference
     kernel : `callable`
-        kernel function for KSD f(histogram) -> (k(x,.), \nabla_x k(x,.))
+        kernel function for KSD :math:`f(histogram) -> (k(x,.), \nabla_x k(x,.))`
     scale_cost_to_minibatch : bool, default False
         Scale cost to minibatch instead of full dataset
     start : `dict`
@@ -533,6 +534,7 @@ class SVGD(Inference):
         Stein Variational Gradient Descent: A General Purpose Bayesian Inference Algorithm
         arXiv:1608.04471
     """
+
     def __init__(self, n_particles=100, jitter=.01, model=None, kernel=test_functions.rbf,
                  scale_cost_to_minibatch=False, start=None, histogram=None,
                  random_seed=None, local_rv=None):
@@ -548,20 +550,20 @@ def __init__(self, n_particles=100, jitter=.01, model=None, kernel=test_function


 def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, start=None, **kwargs):
-    """
+    R"""
     Handy shortcut for using inference methods in functional way

     Parameters
     ----------
     n : `int`
         number of iterations
     local_rv : dict[var->tuple]
-        mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
+        mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)}
         Local Vars are used for Autoencoding Variational Bayes
         See (AEVB; Kingma and Welling, 2014) for details
     method : str or :class:`Inference`
         string name is case insensitive in {'advi', 'fullrank_advi', 'advi->fullrank_advi', 'svgd'}
-    model : :class:`Model`
+    model : :class:`pymc3.Model`
         PyMC3 model for inference
     random_seed : None or int
         leave None to use package global RandomStream or other
@@ -573,7 +575,7 @@ def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, sta
     ----------------
     frac : `float`
         if method is 'advi->fullrank_advi' represents advi fraction when training
-    kwargs : kwargs for :method:`Inference.fit`
+    kwargs : kwargs for :func:`Inference.fit`

     Returns
     -------
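A minimal usage sketch of this shortcut; the toy model and the call to the approximation's sample method are illustrative assumptions based on contemporary PyMC3 releases, not part of this changeset:

import numpy as np
import pymc3 as pm

data = np.random.randn(1000)

with pm.Model():
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('obs', mu=mu, sd=1., observed=data)
    # 'advi' is the default; 'fullrank_advi', 'advi->fullrank_advi' and 'svgd'
    # are also accepted, as documented above
    approx = pm.fit(n=10000, method='advi')

trace = approx.sample(500)  # draw samples from the fitted approximation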