From 7b5d5703dc976b088075f4391271b071ae60f6ae Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Wed, 12 Apr 2017 22:36:42 +0300 Subject: [PATCH 01/28] add sample_vp, deprecate old ADVI --- pymc3/variational/__init__.py | 2 +- pymc3/variational/advi.py | 3 +++ pymc3/variational/advi_minibatch.py | 3 +++ pymc3/variational/approximations.py | 39 ++++++++++++++++++++++++++++- pymc3/variational/opvi.py | 2 +- 5 files changed, 46 insertions(+), 3 deletions(-) diff --git a/pymc3/variational/__init__.py b/pymc3/variational/__init__.py index 1df84767c1..233f36b6c9 100644 --- a/pymc3/variational/__init__.py +++ b/pymc3/variational/__init__.py @@ -1,4 +1,4 @@ -from .advi import advi, sample_vp +from .advi import advi from .advi_minibatch import advi_minibatch from .updates import ( diff --git a/pymc3/variational/advi.py b/pymc3/variational/advi.py index 759c877183..165431b311 100644 --- a/pymc3/variational/advi.py +++ b/pymc3/variational/advi.py @@ -108,6 +108,9 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False, and Blei, D. M. (2016). Automatic Differentiation Variational Inference. arXiv preprint arXiv:1603.00788. """ + import warnings + warnings.warn('Old ADVI interface is deprecated and be removed in future, use pm.ADVI instead', + DeprecationWarning, stacklevel=2) model = pm.modelcontext(model) if start is None: start = model.test_point diff --git a/pymc3/variational/advi_minibatch.py b/pymc3/variational/advi_minibatch.py index fad8cf5561..b97e04a6c5 100644 --- a/pymc3/variational/advi_minibatch.py +++ b/pymc3/variational/advi_minibatch.py @@ -436,6 +436,9 @@ def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1, Weight Uncertainty in Neural Network. In Proceedings of the 32nd International Conference on Machine Learning (ICML-15) (pp. 1613-1622). """ + import warnings + warnings.warn('Old ADVI interface is deprecated and be removed in future, use pm.ADVI instead', + DeprecationWarning, stacklevel=2) if encoder_params is None: encoder_params = [] diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 957ff3fbbb..52ff70e427 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -12,7 +12,8 @@ __all__ = [ 'MeanField', 'FullRank', - 'Histogram' + 'Histogram', + 'sample_vp' ] @@ -353,3 +354,39 @@ def from_noise(cls, size, jitter=.01, local_rv=None, start=None, model=None): x0 += np.random.normal(0, jitter, x0.shape) hist.histogram.set_value(x0) return hist + + +def sample_vp(approx, draws=100, hide_transformed=False, **kwargs): + """ + Draw samples from variational posterior. + + Parameters + ---------- + approx : Approximation + draws : int + Number of random samples. + hide_transformed : bool + If False, transformed variables are also sampled. Default is True. + + Returns + ------- + trace : pymc3.backends.base.MultiTrace + Samples drawn from variational posterior. 
+ """ + if approx.__class__.__name__ == 'ADVIFit': + import warnings + warnings.warn('Old ADVI interface is deprecated and be removed in future', + DeprecationWarning, stacklevel=2) + _approx = approx + model = kwargs.get('model') + local_rv = kwargs.get('local_RVs') + approx = MeanField(model=model, local_rv=local_rv) + bij = DictToArrayBijection(approx.order, {}) + means = bij.map(_approx.means) + stds = bij.map(_approx.stds) + rhos = np.log(np.exp(stds) - 1) + approx.mean.set_value(means.astype(approx.mean.dtype)) + approx.rho.set_value(rhos.astype(approx.rho.dtype)) + if not isinstance(approx, Approximation): + raise TypeError('Need Approximation instance, got %r' % approx) + return approx.sample_vp(draws=draws, hide_transformed=hide_transformed) diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py index 71b2e6dda9..24c5b182d1 100644 --- a/pymc3/variational/opvi.py +++ b/pymc3/variational/opvi.py @@ -811,7 +811,7 @@ def sample_vp(self, draws=1, hide_transformed=False): Returns ------- trace : pymc3.backends.base.MultiTrace - Samples drawn from the variational posterior. + Samples drawn from variational posterior. """ if hide_transformed: vars_sampled = [v_ for v_ in self.model.unobserved_RVs From 31e524da47998b1fcbbafb62f1682612f8900935 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Wed, 12 Apr 2017 22:37:06 +0300 Subject: [PATCH 02/28] found typo --- pymc3/variational/approximations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 52ff70e427..133d4b3c65 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -56,7 +56,7 @@ def rho(self): @property def cov(self): - return tt.diag(rho2sd(self.rho)) + return tt.diag(rho2sd(self.rho)**2) def create_shared_params(self): return {'mu': theano.shared( From ea1cd8286070f5402ebbb0aa0e3088f142054a43 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Wed, 12 Apr 2017 22:38:27 +0300 Subject: [PATCH 03/28] fix docs --- pymc3/variational/approximations.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 133d4b3c65..8fe2bb087b 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -70,8 +70,6 @@ def create_shared_params(self): def log_q_W_global(self, z): """ log_q_W samples over q for global vars - Gradient wrt mu, rho in density parametrization - is set to zero to lower variance of ELBO """ mu = self.scale_grad(self.mean) rho = self.scale_grad(self.rho) @@ -165,8 +163,6 @@ def create_shared_params(self): def log_q_W_global(self, z): """ log_q_W samples over q for global vars - Gradient wrt mu, rho in density parametrization - is set to zero to lower variance of ELBO """ mu = self.scale_grad(self.mean) L = self.scale_grad(self.L) From f952cdd0f2375803416546dd014c4e5233197c70 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Wed, 12 Apr 2017 22:56:40 +0300 Subject: [PATCH 04/28] add sample_vp to __init__ --- pymc3/variational/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pymc3/variational/__init__.py b/pymc3/variational/__init__.py index 233f36b6c9..09a341a0bf 100644 --- a/pymc3/variational/__init__.py +++ b/pymc3/variational/__init__.py @@ -25,7 +25,8 @@ from .approximations import ( Histogram, FullRank, - MeanField + MeanField, + sample_vp ) from . 
import approximations From b9d0a03af707037956e5f7d882fc05b6ced9d020 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Thu, 13 Apr 2017 19:26:52 +0300 Subject: [PATCH 05/28] typo --- pymc3/variational/advi.py | 3 ++- pymc3/variational/approximations.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pymc3/variational/advi.py b/pymc3/variational/advi.py index 165431b311..f2627dfa3c 100644 --- a/pymc3/variational/advi.py +++ b/pymc3/variational/advi.py @@ -109,7 +109,8 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False, Inference. arXiv preprint arXiv:1603.00788. """ import warnings - warnings.warn('Old ADVI interface is deprecated and be removed in future, use pm.ADVI instead', + warnings.warn('Old ADVI interface is deprecated and will ' + 'be removed in future, use pm.ADVI instead', DeprecationWarning, stacklevel=2) model = pm.modelcontext(model) if start is None: diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 8fe2bb087b..0ed17a5ed8 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -371,7 +371,7 @@ def sample_vp(approx, draws=100, hide_transformed=False, **kwargs): """ if approx.__class__.__name__ == 'ADVIFit': import warnings - warnings.warn('Old ADVI interface is deprecated and be removed in future', + warnings.warn('Old ADVI interface is deprecated and will be removed in future', DeprecationWarning, stacklevel=2) _approx = approx model = kwargs.get('model') From 35afe54b6e46d99988ccf2d772da414939df2d69 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Thu, 13 Apr 2017 20:23:30 +0300 Subject: [PATCH 06/28] refactor init nuts, and needed stuff --- pymc3/sampling.py | 21 ++++++++------ pymc3/variational/approximations.py | 33 +++++++++++++--------- pymc3/variational/inference.py | 44 +++++++++++++++++++++++++++++ pymc3/variational/opvi.py | 18 ++---------- 4 files changed, 79 insertions(+), 37 deletions(-) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index fbea31d41e..7b36fbebf7 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -24,6 +24,7 @@ STEP_METHODS = (NUTS, HamiltonianMC, Metropolis, BinaryMetropolis, BinaryGibbsMetropolis, Slice, CategoricalGibbsMetropolis) + def assign_step_methods(model, step=None, methods=STEP_METHODS, step_kwargs=None): """Assign model variables to appropriate step methods. 
@@ -566,19 +567,21 @@ def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, init = init.lower() if init == 'advi': - v_params = pm.variational.advi(n=n_init, random_seed=random_seed, - progressbar=progressbar) - start = pm.variational.sample_vp(v_params, njobs, progressbar=False, - hide_transformed=False, - random_seed=random_seed) + approx = pm.fit( + n=n_init, method='advi', model=model + ) # type: pm.MeanField + start = approx.sample_vp(draws=njobs) + cov = approx.cov.eval() if njobs == 1: start = start[0] - cov = np.power(model.dict_to_array(v_params.stds), 2) elif init == 'advi_map': start = pm.find_MAP() - v_params = pm.variational.advi(n=n_init, start=start, - random_seed=random_seed) - cov = np.power(model.dict_to_array(v_params.stds), 2) + approx = pm.MeanField(model=model, start=start) + pm.fit(n=n_init, method=pm.ADVI.from_mean_field(approx)) + start = approx.sample_vp(draws=n_init) + cov = approx.cov.eval() + if njobs == 1: + start = start[0] elif init == 'map': start = pm.find_MAP() cov = pm.find_hessian(point=start) diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 0ed17a5ed8..80accb97a6 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -33,6 +33,9 @@ class MeanField(Approximation): model : PyMC3 model for inference + start : Point + initial mean + cost_part_grad_scale : float or scalar tensor Scaling score part of gradient can be useful near optimum for archiving better convergence properties. Common schedule is @@ -58,9 +61,10 @@ def rho(self): def cov(self): return tt.diag(rho2sd(self.rho)**2) - def create_shared_params(self): + def create_shared_params(self, **kwargs): + start = self.gbij(kwargs.get('start', self.model.test_point)) return {'mu': theano.shared( - pm.floatX(self.input.tag.test_value[self.global_slc]), + pm.floatX(start), 'mu'), 'rho': theano.shared( np.zeros((self.global_size,), dtype=theano.config.floatX), @@ -100,6 +104,9 @@ class FullRank(Approximation): model : PyMC3 model for inference + start : Point + initial mean + cost_part_grad_scale : float or scalar tensor Scaling score part of gradient can be useful near optimum for archiving better convergence properties. 
Common schedule is @@ -147,16 +154,15 @@ def tril_index_matrix(self): tril_index_matrix[np.tril_indices(n)[::-1]] = np.arange(num_tril_entries) return tril_index_matrix - def create_shared_params(self): + def create_shared_params(self, **kwargs): + start = self.gbij(kwargs.get('start', self.model.test_point)) n = self.global_size L_tril = ( np.eye(n) [np.tril_indices(n)] .astype(theano.config.floatX) ) - return {'mu': theano.shared( - self.input.tag.test_value[self.global_slc], - 'mu'), + return {'mu': theano.shared(pm.floatX(start), 'mu'), 'L_tril': theano.shared(L_tril, 'L_tril') } @@ -251,18 +257,14 @@ def check_model(self, model, **kwargs): for var in model.free_RVs])): raise ValueError('trace has not all FreeRV') - def _setup(self, **kwargs): - self._histogram_order = ArrayOrdering(self.global_vars) - self._bij = DictToArrayBijection(self._histogram_order, dict()) - def create_shared_params(self, **kwargs): trace = kwargs.get('trace') if trace is None: - histogram = np.atleast_2d(self._bij.map(self.model.test_point)) + histogram = np.atleast_2d(self.gbij.map(self.model.test_point)) else: histogram = np.empty((len(trace), self.global_size)) for i in range(len(trace)): - histogram[i] = self._bij.map(trace[i]) + histogram[i] = self.gbij.map(trace[i]) return theano.shared(pm.floatX(histogram), 'histogram') def randidx(self, size=None): @@ -320,6 +322,11 @@ def mapping(z): def mean(self): return self.histogram.mean(0) + @property + def cov(self): + x = (self.histogram - self.mean) + return x.T.dot(x) / self.histogram.shape[0] + @classmethod def from_noise(cls, size, jitter=.01, local_rv=None, start=None, model=None): """ @@ -344,7 +351,7 @@ def from_noise(cls, size, jitter=.01, local_rv=None, start=None, model=None): hist = cls(None, local_rv=local_rv, model=model) if start is None: start = hist.model.test_point - start = hist._bij.map(start) + start = hist.gbij.map(start) # Initialize particles x0 = np.tile(start, (size, 1)) x0 += np.random.normal(0, jitter, x0.shape) diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index e0bb005de3..cf2ffb2fe6 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -202,6 +202,28 @@ def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1): KL, MeanField, None, local_rv=local_rv, model=model, cost_part_grad_scale=cost_part_grad_scale) + @classmethod + def from_mean_field(cls, mean_field): + """ + Construct ADVI from MeanField approximation + + Parameters + ---------- + mean_field : MeanField + approximation to start with + + Returns + ------- + ADVI + """ + if not isinstance(mean_field, MeanField): + raise TypeError('Expected MeanField, got %r' % mean_field) + inference = object.__new__(cls) + objective = KL(mean_field)(None) + inference.hist = np.asarray(()) + inference.objective = objective + return inference + class FullRankADVI(Inference): """ @@ -241,6 +263,28 @@ def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, gpu_compat KL, FullRank, None, local_rv=local_rv, model=model, cost_part_grad_scale=cost_part_grad_scale, gpu_compat=gpu_compat) + @classmethod + def from_full_rank(cls, full_rank): + """ + Construct FullRankADVI from FullRank approximation + + Parameters + ---------- + full_rank : FullRank + approximation to start with + + Returns + ------- + FullRankADVI + """ + if not isinstance(full_rank, FullRank): + raise TypeError('Expected MeanField, got %r' % full_rank) + inference = object.__new__(cls) + objective = KL(full_rank)(None) + inference.hist = 
np.asarray(()) + inference.objective = objective + return inference + @classmethod def from_mean_field(cls, mean_field, gpu_compat=False): """ diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py index 24c5b182d1..ecf60a5ab5 100644 --- a/pymc3/variational/opvi.py +++ b/pymc3/variational/opvi.py @@ -6,7 +6,7 @@ import pymc3 as pm from .updates import adam from ..distributions.dist_math import rho2sd, log_normal -from ..model import modelcontext, ArrayOrdering +from ..model import modelcontext, ArrayOrdering, DictToArrayBijection from ..theanof import tt_rng, memoize, change_flags, GradScale @@ -487,6 +487,8 @@ def get_transformed(v): self.local_vars = self.get_local_vars(**kwargs) self.global_vars = self.get_global_vars(**kwargs) self.order = ArrayOrdering(self.local_vars + self.global_vars) + self.gbij = DictToArrayBijection(ArrayOrdering(self.global_vars), {}) + self.lbij = DictToArrayBijection(ArrayOrdering(self.local_vars), {}) self.flat_view = model.flatten( vars=self.local_vars + self.global_vars ) @@ -509,20 +511,6 @@ def get_global_vars(self, **kwargs): def get_local_vars(self, **kwargs): return [v for v in self.model.free_RVs if v in self.known] - def __getstate__(self): - state = self.__dict__.copy() - # can be inferred from the rest parts - state.pop('flat_view') - state.pop('order') - return state - - def __setstate__(self, state): - self.__dict__.update(state) - self.order = ArrayOrdering(self.local_vars + self.global_vars) - self.flat_view = self.model.flatten( - vars=self.local_vars + self.global_vars - ) - _view = property(lambda self: self.flat_view.view) input = property(lambda self: self.flat_view.input) From 5c4edd5fe10047ae647310c75b0b1c8389233b58 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Fri, 14 Apr 2017 01:51:55 +0300 Subject: [PATCH 07/28] refactor sampling --- pymc3/sampling.py | 6 ++-- pymc3/tests/test_variational_inference.py | 4 ++- pymc3/variational/__init__.py | 1 + pymc3/variational/approximations.py | 4 +-- pymc3/variational/callbacks.py | 37 +++++++++++++++++++++++ pymc3/variational/inference.py | 11 ++++--- 6 files changed, 53 insertions(+), 10 deletions(-) create mode 100644 pymc3/variational/callbacks.py diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 7b36fbebf7..aeb426dcfb 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -568,7 +568,8 @@ def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, if init == 'advi': approx = pm.fit( - n=n_init, method='advi', model=model + n=n_init, method='advi', model=model, + callbacks=[pm.callbacks.CheckLossConvergence()] ) # type: pm.MeanField start = approx.sample_vp(draws=njobs) cov = approx.cov.eval() @@ -577,7 +578,8 @@ def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, elif init == 'advi_map': start = pm.find_MAP() approx = pm.MeanField(model=model, start=start) - pm.fit(n=n_init, method=pm.ADVI.from_mean_field(approx)) + pm.fit(n=n_init, method=pm.ADVI.from_mean_field(approx), + callbacks=[pm.callbacks.CheckLossConvergence()]) start = approx.sample_vp(draws=n_init) cov = approx.cov.eval() if njobs == 1: diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index e7d1a3a996..c2f98f7b50 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -145,7 +145,9 @@ def test_optimizer_with_full_data(self): Normal('x', mu=mu_, sd=sd, observed=data) inf = self.inference() inf.fit(10) - approx = inf.fit(self.NITER, obj_optimizer=self.optimizer) + approx = inf.fit(self.NITER, 
+ obj_optimizer=self.optimizer, + callbacks=[pm.callbacks.CheckLossConvergence()]) trace = approx.sample_vp(10000) np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) diff --git a/pymc3/variational/__init__.py b/pymc3/variational/__init__.py index 09a341a0bf..d2a58ec491 100644 --- a/pymc3/variational/__init__.py +++ b/pymc3/variational/__init__.py @@ -35,3 +35,4 @@ from . import opvi from . import updates from . import inference +from . import callbacks diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 80accb97a6..ca241061a7 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -62,7 +62,7 @@ def cov(self): return tt.diag(rho2sd(self.rho)**2) def create_shared_params(self, **kwargs): - start = self.gbij(kwargs.get('start', self.model.test_point)) + start = self.gbij.map(kwargs.get('start', self.model.test_point)) return {'mu': theano.shared( pm.floatX(start), 'mu'), @@ -155,7 +155,7 @@ def tril_index_matrix(self): return tril_index_matrix def create_shared_params(self, **kwargs): - start = self.gbij(kwargs.get('start', self.model.test_point)) + start = self.gbij.map(kwargs.get('start', self.model.test_point)) n = self.global_size L_tril = ( np.eye(n) diff --git a/pymc3/variational/callbacks.py b/pymc3/variational/callbacks.py new file mode 100644 index 0000000000..f33379aa41 --- /dev/null +++ b/pymc3/variational/callbacks.py @@ -0,0 +1,37 @@ +import scipy.stats as stats + + +class Callback(object): + def __call__(self, approx, loss, i): + raise NotImplementedError + + +class CheckLossConvergence(Callback): + def __init__(self, every=100, window_size=1000, tolerance=1e-3): + """ + + Parameters + ---------- + every : int + how often check convergence + window_size : + last elbos to take + tolerance : float + Error rate under null hypothesis, consider taking small values + """ + self.every = every + self.window_size = window_size + self.critical = tolerance / 2. 
+ + def __call__(self, approx, hist, i): + if hist is None or i < self.window_size or i % self.every: + return + diff = hist[-self.window_size:] - hist[-self.window_size-1:-1] + mean = diff.mean() + # unbiased std of mean + std = diff.std() / (self.window_size - 1) + t = abs(mean / std) + p = stats.t.cdf(t, df=self.window_size) - .5 + # 1 - confidence is lower allowed p + if p < self.critical: + raise StopIteration diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index cf2ffb2fe6..929998cbb3 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -88,7 +88,7 @@ def run_profiling(self, n=1000, score=None, **kwargs): progress.close() return step_func.profile - def fit(self, n=10000, score=None, callbacks=None, callback_every=1, + def fit(self, n=10000, score=None, callbacks=None, **kwargs): """ Performs Operator Variational Inference @@ -129,9 +129,8 @@ def fit(self, n=10000, score=None, callbacks=None, callback_every=1, if i % 10 == 0: avg_loss = scores[max(0, i - 1000):i+1].mean() progress.set_description('Average Loss = {:,.5g}'.format(avg_loss)) - if i % callback_every == 0: - for callback in callbacks: - callback(self.approx, scores[:i+1], i) + for callback in callbacks: + callback(self.approx, scores[:i+1], i) except (KeyboardInterrupt, StopIteration): # pragma: no cover # do not print log on the same line progress.close() @@ -156,7 +155,9 @@ def fit(self, n=10000, score=None, callbacks=None, callback_every=1, try: for _ in progress: step_func() - except KeyboardInterrupt: + for callback in callbacks: + callback(self.approx, None, i) + except (KeyboardInterrupt, StopIteration): pass finally: progress.close() From 5e12767c086c062b60b8a9ed8cb654e843d4d9f7 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Fri, 14 Apr 2017 02:11:30 +0300 Subject: [PATCH 08/28] typo --- pymc3/variational/callbacks.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pymc3/variational/callbacks.py b/pymc3/variational/callbacks.py index f33379aa41..16683b959a 100644 --- a/pymc3/variational/callbacks.py +++ b/pymc3/variational/callbacks.py @@ -1,4 +1,5 @@ import scipy.stats as stats +import numpy as np class Callback(object): @@ -26,10 +27,11 @@ def __init__(self, every=100, window_size=1000, tolerance=1e-3): def __call__(self, approx, hist, i): if hist is None or i < self.window_size or i % self.every: return - diff = hist[-self.window_size:] - hist[-self.window_size-1:-1] + diff = ((hist[-self.window_size:] - hist[-self.window_size-1:-1]) + / hist[-self.window_size-1:-1]) mean = diff.mean() # unbiased std of mean - std = diff.std() / (self.window_size - 1) + std = diff.std() / (self.window_size - 1)**.5 t = abs(mean / std) p = stats.t.cdf(t, df=self.window_size) - .5 # 1 - confidence is lower allowed p From 2b4442d3d1bbeb07e6ae7ed39c3b4b1280391925 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Fri, 14 Apr 2017 10:52:22 +0300 Subject: [PATCH 09/28] unused import --- pymc3/variational/approximations.py | 2 +- pymc3/variational/callbacks.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index ca241061a7..f632e5cbc9 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -3,7 +3,7 @@ from theano import tensor as tt import pymc3 as pm -from pymc3 import ArrayOrdering, DictToArrayBijection +from pymc3 import DictToArrayBijection from pymc3.distributions.dist_math import rho2sd, log_normal, 
log_normal_mv from pymc3.variational.opvi import Approximation from pymc3.theanof import tt_rng, memoize diff --git a/pymc3/variational/callbacks.py b/pymc3/variational/callbacks.py index 16683b959a..fb5c61c402 100644 --- a/pymc3/variational/callbacks.py +++ b/pymc3/variational/callbacks.py @@ -1,5 +1,4 @@ import scipy.stats as stats -import numpy as np class Callback(object): From ac715bf8b0c36abbc357e4d5e0a439aa37ab1545 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Fri, 14 Apr 2017 22:13:29 +0300 Subject: [PATCH 10/28] rename sample_approx --- pymc3/sampling.py | 4 ++-- pymc3/tests/test_variational_inference.py | 22 +++++++++++----------- pymc3/variational/__init__.py | 4 ++-- pymc3/variational/advi.py | 8 ++++++-- pymc3/variational/approximations.py | 18 ++---------------- pymc3/variational/opvi.py | 2 +- 6 files changed, 24 insertions(+), 34 deletions(-) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index aeb426dcfb..a0f3530d9b 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -571,7 +571,7 @@ def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, n=n_init, method='advi', model=model, callbacks=[pm.callbacks.CheckLossConvergence()] ) # type: pm.MeanField - start = approx.sample_vp(draws=njobs) + start = approx.sample(draws=njobs) cov = approx.cov.eval() if njobs == 1: start = start[0] @@ -580,7 +580,7 @@ def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, approx = pm.MeanField(model=model, start=start) pm.fit(n=n_init, method=pm.ADVI.from_mean_field(approx), callbacks=[pm.callbacks.CheckLossConvergence()]) - start = approx.sample_vp(draws=n_init) + start = approx.sample(draws=n_init) cov = approx.cov.eval() if njobs == 1: start = start[0] diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index c2f98f7b50..b513437505 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -59,7 +59,7 @@ def _test_aevb(self): with model: inference = self.inference(local_rv={x: (mu, rho)}) approx = inference.fit(3, obj_n_mc=2, obj_optimizer=self.optimizer) - approx.sample_vp(10) + approx.sample(10) approx.apply_replacements( y, more_replacements={x: np.asarray([1, 1], dtype=x.dtype)} @@ -105,17 +105,17 @@ def test_vars_view_dynamic_size_numpy(self): x_sampled = app.view(app.random_fn(), 'x') assert x_sampled.shape == () + model['x'].dshape - def test_sample_vp(self): + def test_sample(self): n_samples = 100 xs = np.random.binomial(n=1, p=0.2, size=n_samples) with pm.Model(): p = pm.Beta('p', alpha=1, beta=1) pm.Binomial('xs', n=1, p=p, observed=xs) app = self.inference().approx - trace = app.sample_vp(draws=1, hide_transformed=True) + trace = app.sample(draws=1, hide_transformed=True) assert trace.varnames == ['p'] assert len(trace) == 1 - trace = app.sample_vp(draws=10, hide_transformed=False) + trace = app.sample(draws=10, hide_transformed=False) assert sorted(trace.varnames) == ['p', 'p_logodds_'] assert len(trace) == 10 @@ -148,7 +148,7 @@ def test_optimizer_with_full_data(self): approx = inf.fit(self.NITER, obj_optimizer=self.optimizer, callbacks=[pm.callbacks.CheckLossConvergence()]) - trace = approx.sample_vp(10000) + trace = approx.sample(10000) np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. 
/ d), rtol=0.4) @@ -175,7 +175,7 @@ def create_minibatch(data): Normal('x', mu=mu_, sd=sd, observed=minibatches, total_size=n) inf = self.inference() approx = inf.fit(self.NITER * 3, obj_optimizer=self.optimizer) - trace = approx.sample_vp(10000) + trace = approx.sample(10000) np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) @@ -206,7 +206,7 @@ def cb(*_): Normal('x', mu=mu_, sd=sd, observed=data_t, total_size=n) inf = self.inference() approx = inf.fit(self.NITER * 3, callbacks=[cb], obj_n_mc=10, obj_optimizer=self.optimizer) - trace = approx.sample_vp(10000) + trace = approx.sample(10000) np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.4) np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) @@ -281,9 +281,9 @@ def test_sampling(self): with models.multidimensional_model()[1]: full_rank = FullRankADVI() approx = full_rank.fit(20) - trace0 = approx.sample_vp(10000) + trace0 = approx.sample(10000) histogram = Histogram(trace0) - trace1 = histogram.sample_vp(100000) + trace1 = histogram.sample(100000) np.testing.assert_allclose(trace0['x'].mean(0), trace1['x'].mean(0), atol=0.01) np.testing.assert_allclose(trace0['x'].var(0), trace1['x'].var(0), atol=0.01) @@ -295,9 +295,9 @@ def test_aevb_histogram(self): with model: inference = ADVI(local_rv={x: (mu, rho)}) approx = inference.approx - trace0 = approx.sample_vp(10000) + trace0 = approx.sample(10000) histogram = Histogram(trace0, local_rv={x: (mu, rho)}) - trace1 = histogram.sample_vp(10000) + trace1 = histogram.sample(10000) histogram.random(no_rand=True) histogram.random_fn(no_rand=True) np.testing.assert_allclose(trace0['y'].mean(0), trace1['y'].mean(0), atol=0.02) diff --git a/pymc3/variational/__init__.py b/pymc3/variational/__init__.py index d2a58ec491..a61d63c8db 100644 --- a/pymc3/variational/__init__.py +++ b/pymc3/variational/__init__.py @@ -1,4 +1,4 @@ -from .advi import advi +from .advi import advi, sample_vp from .advi_minibatch import advi_minibatch from .updates import ( @@ -26,7 +26,7 @@ Histogram, FullRank, MeanField, - sample_vp + sample_approx ) from . import approximations diff --git a/pymc3/variational/advi.py b/pymc3/variational/advi.py index f2627dfa3c..fffdc1814c 100644 --- a/pymc3/variational/advi.py +++ b/pymc3/variational/advi.py @@ -109,8 +109,8 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False, Inference. arXiv preprint arXiv:1603.00788. """ import warnings - warnings.warn('Old ADVI interface is deprecated and will ' - 'be removed in future, use pm.ADVI instead', + warnings.warn('Old ADVI interface and sample_vp is deprecated and will ' + 'be removed in future, use pm.fit and pm.sample_approx instead', DeprecationWarning, stacklevel=2) model = pm.modelcontext(model) if start is None: @@ -361,6 +361,10 @@ def sample_vp( trace : pymc3.backends.base.MultiTrace Samples drawn from the variational posterior. 
""" + import warnings + warnings.warn('Old ADVI interface and sample_vp is deprecated and will ' + 'be removed in future, use pm.fit and pm.sample_approx instead', + DeprecationWarning, stacklevel=2) model = pm.modelcontext(model) if isinstance(vparams, ADVIFit): diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index f632e5cbc9..22129e643d 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -359,7 +359,7 @@ def from_noise(cls, size, jitter=.01, local_rv=None, start=None, model=None): return hist -def sample_vp(approx, draws=100, hide_transformed=False, **kwargs): +def sample_approx(approx, draws=100, hide_transformed=False): """ Draw samples from variational posterior. @@ -376,20 +376,6 @@ def sample_vp(approx, draws=100, hide_transformed=False, **kwargs): trace : pymc3.backends.base.MultiTrace Samples drawn from variational posterior. """ - if approx.__class__.__name__ == 'ADVIFit': - import warnings - warnings.warn('Old ADVI interface is deprecated and will be removed in future', - DeprecationWarning, stacklevel=2) - _approx = approx - model = kwargs.get('model') - local_rv = kwargs.get('local_RVs') - approx = MeanField(model=model, local_rv=local_rv) - bij = DictToArrayBijection(approx.order, {}) - means = bij.map(_approx.means) - stds = bij.map(_approx.stds) - rhos = np.log(np.exp(stds) - 1) - approx.mean.set_value(means.astype(approx.mean.dtype)) - approx.rho.set_value(rhos.astype(approx.rho.dtype)) if not isinstance(approx, Approximation): raise TypeError('Need Approximation instance, got %r' % approx) - return approx.sample_vp(draws=draws, hide_transformed=hide_transformed) + return approx.sample(draws=draws, hide_transformed=hide_transformed) diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py index ecf60a5ab5..2fe70357d2 100644 --- a/pymc3/variational/opvi.py +++ b/pymc3/variational/opvi.py @@ -785,7 +785,7 @@ def inner(size=None, no_rand=False): return inner - def sample_vp(self, draws=1, hide_transformed=False): + def sample(self, draws=1, hide_transformed=False): """ Draw samples from variational posterior. From 67fe0719d3edee676d355b76314af2a1b64a07f3 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Fri, 14 Apr 2017 22:23:19 +0300 Subject: [PATCH 11/28] unused import --- pymc3/variational/approximations.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 22129e643d..eddc15b44f 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -3,7 +3,6 @@ from theano import tensor as tt import pymc3 as pm -from pymc3 import DictToArrayBijection from pymc3.distributions.dist_math import rho2sd, log_normal, log_normal_mv from pymc3.variational.opvi import Approximation from pymc3.theanof import tt_rng, memoize From e9ce5998c95275572325d792f0ff4f543b7900cc Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Sat, 15 Apr 2017 00:12:28 +0300 Subject: [PATCH 12/28] allow seed kwarg --- pymc3/theanof.py | 15 ++++++++++++--- pymc3/variational/approximations.py | 4 ++-- pymc3/variational/opvi.py | 23 ++++++++++++++++++++--- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/pymc3/theanof.py b/pymc3/theanof.py index 3d4c9fdd08..10c5efcce6 100644 --- a/pymc3/theanof.py +++ b/pymc3/theanof.py @@ -372,9 +372,15 @@ def launch_rng(rng): launch_rng(_tt_rng) -def tt_rng(): +def tt_rng(seed=None): """ - Get the package-level random number generator. 
+ Get the package-level random number generator or new with specified seed. + + Parameters + ---------- + seed : int + If not None + returns *new* theano random generator without replacing package global one Returns ------- @@ -382,7 +388,10 @@ def tt_rng(): `theano.sandbox.rng_mrg.MRG_RandomStreams` instance passed to the most recent call of `set_tt_rng` """ - return _tt_rng + if seed is None: + return _tt_rng + else: + return MRG_RandomStreams(seed) def set_tt_rng(new_rng): diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index eddc15b44f..853e68c3d0 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -5,7 +5,7 @@ import pymc3 as pm from pymc3.distributions.dist_math import rho2sd, log_normal, log_normal_mv from pymc3.variational.opvi import Approximation -from pymc3.theanof import tt_rng, memoize +from pymc3.theanof import memoize __all__ = [ @@ -278,7 +278,7 @@ def randidx(self, size=None): pass else: size = tuple(np.atleast_1d(size)) - return (tt_rng() + return (self._rng .uniform(size=size, low=0.0, high=self.histogram.shape[0] - 1e-16) .astype('int64')) diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py index 2fe70357d2..28ff46b3a9 100644 --- a/pymc3/variational/opvi.py +++ b/pymc3/variational/opvi.py @@ -431,6 +431,10 @@ class Approximation(object): See (Sticking the Landing; Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016) for details + seed : None or int + leave None to use package global RandomStream or other + valid value to create instance specific one + Subclassing ----------- Defining an approximation needs @@ -470,8 +474,10 @@ class Approximation(object): initial_dist_name = 'normal' initial_dist_map = 0. - def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, **kwargs): + def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, seed=None, **kwargs): model = modelcontext(model) + self._seed = seed + self._rng = tt_rng(seed) self.model = model self.check_model(model, **kwargs) if local_rv is None: @@ -496,6 +502,17 @@ def get_transformed(v): self._setup(**kwargs) self.shared_params = self.create_shared_params(**kwargs) + def seed(self, seed=None): + """ + Reinitialize RandomStream used by this approximation + + Parameters + ---------- + seed : int + """ + self._seed = seed + self._rng.seed(seed) + @property def normalizing_constant(self): t = self.to_flat_input(tt.max([v.scaling for v in self.model.basic_RVs])) @@ -678,7 +695,7 @@ def initial(self, size, no_rand=False, l=None): shape = tt.stack(*shape) if theano_condition_is_here: no_rand = tt.as_tensor(no_rand) - sample = getattr(tt_rng(), self.initial_dist_name)(shape) + sample = getattr(self._rng, self.initial_dist_name)(shape) space = tt.switch( no_rand, tt.ones_like(sample) * self.initial_dist_map, @@ -688,7 +705,7 @@ def initial(self, size, no_rand=False, l=None): if no_rand: return tt.ones(shape) * self.initial_dist_map else: - return getattr(tt_rng(), self.initial_dist_name)(shape) + return getattr(self._rng, self.initial_dist_name)(shape) return space def random_local(self, size=None, no_rand=False): From 21c16153ecd473a027df2af1e9a4fd3c71810e1a Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Sat, 15 Apr 2017 01:40:10 +0300 Subject: [PATCH 13/28] add docks, additional callbacks --- pymc3/sampling.py | 10 +++++-- pymc3/tests/test_variational_inference.py | 11 ++++++-- pymc3/variational/approximations.py | 25 +++++++++++++---- pymc3/variational/callbacks.py | 32 +++++++++++++++++++-- 
pymc3/variational/inference.py | 34 ++++++++++++++++------- 5 files changed, 88 insertions(+), 24 deletions(-) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index a0f3530d9b..785596ec7d 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -568,6 +568,7 @@ def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, if init == 'advi': approx = pm.fit( + seed=random_seed, n=n_init, method='advi', model=model, callbacks=[pm.callbacks.CheckLossConvergence()] ) # type: pm.MeanField @@ -578,9 +579,12 @@ def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, elif init == 'advi_map': start = pm.find_MAP() approx = pm.MeanField(model=model, start=start) - pm.fit(n=n_init, method=pm.ADVI.from_mean_field(approx), - callbacks=[pm.callbacks.CheckLossConvergence()]) - start = approx.sample(draws=n_init) + pm.fit( + seed=random_seed, + n=n_init, method=pm.ADVI.from_mean_field(approx), + callbacks=[pm.callbacks.CheckLossConvergence()] + ) + start = approx.sample(draws=njobs) cov = approx.cov.eval() if njobs == 1: start = start[0] diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index b513437505..6d0434dae4 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -147,7 +147,8 @@ def test_optimizer_with_full_data(self): inf.fit(10) approx = inf.fit(self.NITER, obj_optimizer=self.optimizer, - callbacks=[pm.callbacks.CheckLossConvergence()]) + callbacks= + [pm.callbacks.CheckLossConvergence1()]) trace = approx.sample(10000) np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) @@ -174,7 +175,9 @@ def create_minibatch(data): mu_ = Normal('mu', mu=mu0, sd=sd0, testval=0) Normal('x', mu=mu_, sd=sd, observed=minibatches, total_size=n) inf = self.inference() - approx = inf.fit(self.NITER * 3, obj_optimizer=self.optimizer) + approx = inf.fit(self.NITER * 3, obj_optimizer=self.optimizer, + callbacks= + [pm.callbacks.CheckLossConvergence1()]) trace = approx.sample(10000) np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) @@ -205,7 +208,9 @@ def cb(*_): mu_ = Normal('mu', mu=mu0, sd=sd0, testval=0) Normal('x', mu=mu_, sd=sd, observed=data_t, total_size=n) inf = self.inference() - approx = inf.fit(self.NITER * 3, callbacks=[cb], obj_n_mc=10, obj_optimizer=self.optimizer) + approx = inf.fit(self.NITER * 3, callbacks= + [cb, pm.callbacks.CheckLossConvergence1()], + obj_n_mc=10, obj_optimizer=self.optimizer) trace = approx.sample(10000) np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.4) np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. 
/ d), rtol=0.4) diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 853e68c3d0..35df6a88d3 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -42,6 +42,10 @@ class MeanField(Approximation): See (Sticking the Landing; Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016) for details + seed : None or int + leave None to use package global RandomStream or other + valid value to create instance specific one + References ---------- Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 @@ -113,16 +117,21 @@ class FullRank(Approximation): See (Sticking the Landing; Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016) for details + seed : None or int + leave None to use package global RandomStream or other + valid value to create instance specific one + References ---------- Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf """ - def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, gpu_compat=False): + def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, gpu_compat=False, seed=None): super(FullRank, self).__init__( local_rv=local_rv, model=model, - cost_part_grad_scale=cost_part_grad_scale + cost_part_grad_scale=cost_part_grad_scale, + seed=seed ) self.gpu_compat = gpu_compat @@ -239,6 +248,10 @@ class Histogram(Approximation): model : PyMC3 model + seed : None or int + leave None to use package global RandomStream or other + valid value to create instance specific one + Usage ----- >>> with model: @@ -246,8 +259,8 @@ class Histogram(Approximation): ... trace = sample(1000, step=step) ... histogram = Histogram(trace[100:]) """ - def __init__(self, trace, local_rv=None, model=None): - super(Histogram, self).__init__(local_rv=local_rv, model=model, trace=trace) + def __init__(self, trace, local_rv=None, model=None, seed=None): + super(Histogram, self).__init__(local_rv=local_rv, model=model, trace=trace, seed=seed) def check_model(self, model, **kwargs): trace = kwargs.get('trace') @@ -327,7 +340,7 @@ def cov(self): return x.T.dot(x) / self.histogram.shape[0] @classmethod - def from_noise(cls, size, jitter=.01, local_rv=None, start=None, model=None): + def from_noise(cls, size, jitter=.01, local_rv=None, start=None, model=None, seed=None): """ Initialize Histogram with random noise @@ -347,7 +360,7 @@ def from_noise(cls, size, jitter=.01, local_rv=None, start=None, model=None): ------- Histogram """ - hist = cls(None, local_rv=local_rv, model=model) + hist = cls(None, local_rv=local_rv, model=model, seed=seed) if start is None: start = hist.model.test_point start = hist.gbij.map(start) diff --git a/pymc3/variational/callbacks.py b/pymc3/variational/callbacks.py index fb5c61c402..967249d73d 100644 --- a/pymc3/variational/callbacks.py +++ b/pymc3/variational/callbacks.py @@ -1,4 +1,11 @@ import scipy.stats as stats +import numpy as np + +__all__ = [ + 'Callback', + 'CheckLossConvergence1', + 'CheckLossConvergence2' +] class Callback(object): @@ -6,8 +13,8 @@ def __call__(self, approx, loss, i): raise NotImplementedError -class CheckLossConvergence(Callback): - def __init__(self, every=100, window_size=1000, tolerance=1e-3): +class CheckLossConvergence1(Callback): + def __init__(self, every=100, window_size=2000, tolerance=1e-3): """ Parameters @@ -36,3 +43,24 @@ def __call__(self, approx, hist, i): # 1 - confidence is lower allowed p if p < self.critical: raise StopIteration + + 
+class CheckLossConvergence2(Callback): + def __init__(self, every=100, tolerance=1e-2, steps=None): + self.steps = steps + self.every = every + self.tolerance = tolerance + + def __call__(self, approx, hist, i): + if hist is None or i < self.every or i % self.every: + return + if self.steps is None: + window = int(max(0.1 * hist.size // self.every, 2.0)) + else: + window = int(max(0.1 * self.steps // self.every, 2.0)) + losses = hist[::self.every][-window:] + diff = np.abs((losses[1:]-losses[:-1])/losses[:-1]) + mean = np.mean(diff) + med = np.median(diff) + if mean < self.tolerance or med < self.tolerance: + raise StopIteration diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index 929998cbb3..85073a1de5 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -184,6 +184,9 @@ class ADVI(Inference): 1 at the start and 0 in the end. So slow decay will be ok. See (Sticking the Landing; Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016) for details + seed : None or int + leave None to use package global RandomStream or other + valid value to create instance specific one References ---------- @@ -198,10 +201,10 @@ class ADVI(Inference): - Kingma, D. P., & Welling, M. (2014). Auto-Encoding Variational Bayes. stat, 1050, 1. """ - def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1): + def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, seed=None): super(ADVI, self).__init__( KL, MeanField, None, - local_rv=local_rv, model=model, cost_part_grad_scale=cost_part_grad_scale) + local_rv=local_rv, model=model, cost_part_grad_scale=cost_part_grad_scale, seed=seed) @classmethod def from_mean_field(cls, mean_field): @@ -246,6 +249,10 @@ class FullRankADVI(Inference): See (Sticking the Landing; Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016) for details + seed : None or int + leave None to use package global RandomStream or other + valid value to create instance specific one + References ---------- - Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., @@ -259,10 +266,11 @@ class FullRankADVI(Inference): - Kingma, D. P., & Welling, M. (2014). Auto-Encoding Variational Bayes. stat, 1050, 1. 
""" - def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, gpu_compat=False): + def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, gpu_compat=False, seed=None): super(FullRankADVI, self).__init__( KL, FullRank, None, - local_rv=local_rv, model=model, cost_part_grad_scale=cost_part_grad_scale, gpu_compat=gpu_compat) + local_rv=local_rv, model=model, cost_part_grad_scale=cost_part_grad_scale, + gpu_compat=gpu_compat, seed=seed) @classmethod def from_full_rank(cls, full_rank): @@ -366,6 +374,9 @@ class SVGD(Inference): initial point for inference histogram : Histogram initialize SVGD with given Histogram instead of default initial particles + seed : None or int + leave None to use package global RandomStream or other + valid value to create instance specific one References ---------- @@ -374,17 +385,17 @@ class SVGD(Inference): arXiv:1608.04471 """ def __init__(self, n_particles=100, jitter=.01, model=None, kernel=test_functions.rbf, - start=None, histogram=None, local_rv=None): + start=None, histogram=None, seed=None, local_rv=None): if histogram is None: histogram = Histogram.from_noise( - n_particles, jitter=jitter, start=start, model=model, local_rv=local_rv) + n_particles, jitter=jitter, start=start, model=model, local_rv=local_rv, seed=seed) super(SVGD, self).__init__( KSD, histogram, kernel, - model=model) + model=model, seed=seed) -def fit(n=10000, local_rv=None, method='advi', model=None, **kwargs): +def fit(n=10000, local_rv=None, method='advi', model=None, seed=None, **kwargs): """ Handy shortcut for using inference methods in functional way @@ -402,6 +413,9 @@ def fit(n=10000, local_rv=None, method='advi', model=None, **kwargs): kwargs : kwargs for Inference.fit frac : float if method is 'advi->fullrank_advi' represents advi fraction when training + seed : None or int + leave None to use package global RandomStream or other + valid value to create instance specific one Returns ------- @@ -420,7 +434,7 @@ def fit(n=10000, local_rv=None, method='advi', model=None, **kwargs): raise ValueError('frac should be in (0, 1)') n1 = int(n * frac) n2 = n-n1 - inference = ADVI(local_rv=local_rv, model=model) + inference = ADVI(local_rv=local_rv, model=model, seed=seed) logger.info('fitting advi ...') inference.fit(n1, **kwargs) inference = FullRankADVI.from_advi(inference) @@ -430,7 +444,7 @@ def fit(n=10000, local_rv=None, method='advi', model=None, **kwargs): elif isinstance(method, str): try: inference = _select[method.lower()]( - local_rv=local_rv, model=model + local_rv=local_rv, model=model, seed=seed ) except KeyError: raise KeyError('method should be one of %s ' From d493caa1278c158b78aa02c8f23d4f56c311f975 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Sat, 15 Apr 2017 02:16:54 +0300 Subject: [PATCH 14/28] change callback --- pymc3/tests/test_variational_inference.py | 6 +- pymc3/variational/callbacks.py | 67 ++++++----------------- 2 files changed, 21 insertions(+), 52 deletions(-) diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index 6d0434dae4..9e3e4cacb6 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -148,7 +148,7 @@ def test_optimizer_with_full_data(self): approx = inf.fit(self.NITER, obj_optimizer=self.optimizer, callbacks= - [pm.callbacks.CheckLossConvergence1()]) + [pm.callbacks.CheckParametersConvergence()]) trace = approx.sample(10000) np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) 
np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) @@ -177,7 +177,7 @@ def create_minibatch(data): inf = self.inference() approx = inf.fit(self.NITER * 3, obj_optimizer=self.optimizer, callbacks= - [pm.callbacks.CheckLossConvergence1()]) + [pm.callbacks.CheckParametersConvergence()]) trace = approx.sample(10000) np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) @@ -209,7 +209,7 @@ def cb(*_): Normal('x', mu=mu_, sd=sd, observed=data_t, total_size=n) inf = self.inference() approx = inf.fit(self.NITER * 3, callbacks= - [cb, pm.callbacks.CheckLossConvergence1()], + [cb, pm.callbacks.CheckParametersConvergence()], obj_n_mc=10, obj_optimizer=self.optimizer) trace = approx.sample(10000) np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.4) diff --git a/pymc3/variational/callbacks.py b/pymc3/variational/callbacks.py index 967249d73d..96e4d21fde 100644 --- a/pymc3/variational/callbacks.py +++ b/pymc3/variational/callbacks.py @@ -1,10 +1,8 @@ -import scipy.stats as stats import numpy as np __all__ = [ 'Callback', - 'CheckLossConvergence1', - 'CheckLossConvergence2' + 'CheckParametersConvergence' ] @@ -13,54 +11,25 @@ def __call__(self, approx, loss, i): raise NotImplementedError -class CheckLossConvergence1(Callback): - def __init__(self, every=100, window_size=2000, tolerance=1e-3): - """ - - Parameters - ---------- - every : int - how often check convergence - window_size : - last elbos to take - tolerance : float - Error rate under null hypothesis, consider taking small values - """ - self.every = every - self.window_size = window_size - self.critical = tolerance / 2. - - def __call__(self, approx, hist, i): - if hist is None or i < self.window_size or i % self.every: - return - diff = ((hist[-self.window_size:] - hist[-self.window_size-1:-1]) - / hist[-self.window_size-1:-1]) - mean = diff.mean() - # unbiased std of mean - std = diff.std() / (self.window_size - 1)**.5 - t = abs(mean / std) - p = stats.t.cdf(t, df=self.window_size) - .5 - # 1 - confidence is lower allowed p - if p < self.critical: - raise StopIteration - - -class CheckLossConvergence2(Callback): - def __init__(self, every=100, tolerance=1e-2, steps=None): - self.steps = steps +class CheckParametersConvergence(Callback): + def __init__(self, every=1000, tolerance=1e-2): self.every = every + self.prev = None self.tolerance = tolerance - def __call__(self, approx, hist, i): - if hist is None or i < self.every or i % self.every: + def __call__(self, approx, _, i): + if self.prev is None: + self.prev = self.flatten_shared(approx.params) + if i < self.every or i % self.every: return - if self.steps is None: - window = int(max(0.1 * hist.size // self.every, 2.0)) - else: - window = int(max(0.1 * self.steps // self.every, 2.0)) - losses = hist[::self.every][-window:] - diff = np.abs((losses[1:]-losses[:-1])/losses[:-1]) - mean = np.mean(diff) - med = np.median(diff) - if mean < self.tolerance or med < self.tolerance: + current = self.flatten_shared(approx.params) + delta = (current - self.prev)/self.prev + self.prev = current + delta[np.isnan(delta)] = 0 + norm = delta.dot(delta)**.5 + if norm < self.tolerance: raise StopIteration + + @staticmethod + def flatten_shared(shared_list): + return np.concatenate([sh.get_value().flatten() for sh in shared_list]) From d4fd91423b78dc67b019d96c2e985f74757d5fab Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Sat, 15 Apr 2017 11:34:50 +0300 Subject: [PATCH 15/28] 
fix typo --- pymc3/sampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 785596ec7d..40bd0867d2 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -582,7 +582,7 @@ def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, pm.fit( seed=random_seed, n=n_init, method=pm.ADVI.from_mean_field(approx), - callbacks=[pm.callbacks.CheckLossConvergence()] + callbacks=[pm.callbacks.CheckParametersConvergence()] ) start = approx.sample(draws=njobs) cov = approx.cov.eval() From ce70f4a57baa2b229d6b6963af4d5e874c68714c Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Sat, 15 Apr 2017 11:35:22 +0300 Subject: [PATCH 16/28] check nan --- pymc3/variational/inference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index 85073a1de5..ebc000a099 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -153,8 +153,10 @@ def fit(self, n=10000, score=None, callbacks=None, else: # pragma: no cover scores = np.asarray(()) try: - for _ in progress: + for i in progress: step_func() + if np.isnan(self.approx.params[0].get_value()).any(): + raise FloatingPointError('NaN occurred in optimization.') for callback in callbacks: callback(self.approx, None, i) except (KeyboardInterrupt, StopIteration): From 5e5756af29e4231f26fe08186c633a0c73fcd71b Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Wed, 19 Apr 2017 20:41:23 +0300 Subject: [PATCH 17/28] refactor callback --- pymc3/sampling.py | 4 ++-- pymc3/variational/callbacks.py | 12 +++++++----- pymc3/variational/inference.py | 9 ++++++--- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 40bd0867d2..2f08726027 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -570,7 +570,7 @@ def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, approx = pm.fit( seed=random_seed, n=n_init, method='advi', model=model, - callbacks=[pm.callbacks.CheckLossConvergence()] + callbacks=[pm.callbacks.CheckParametersConvergence(tolerance=1e-2)] ) # type: pm.MeanField start = approx.sample(draws=njobs) cov = approx.cov.eval() @@ -582,7 +582,7 @@ def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, pm.fit( seed=random_seed, n=n_init, method=pm.ADVI.from_mean_field(approx), - callbacks=[pm.callbacks.CheckParametersConvergence()] + callbacks=[pm.callbacks.CheckParametersConvergence(tolerance=1e-2)] ) start = approx.sample(draws=njobs) cov = approx.cov.eval() diff --git a/pymc3/variational/callbacks.py b/pymc3/variational/callbacks.py index 96e4d21fde..e48d55dfe0 100644 --- a/pymc3/variational/callbacks.py +++ b/pymc3/variational/callbacks.py @@ -12,10 +12,11 @@ def __call__(self, approx, loss, i): class CheckParametersConvergence(Callback): - def __init__(self, every=1000, tolerance=1e-2): + def __init__(self, every=1000, tolerance=1e-3, eps=1e-10): self.every = every self.prev = None self.tolerance = tolerance + self.eps = np.float32(eps) def __call__(self, approx, _, i): if self.prev is None: @@ -23,12 +24,13 @@ def __call__(self, approx, _, i): if i < self.every or i % self.every: return current = self.flatten_shared(approx.params) - delta = (current - self.prev)/self.prev + prev = self.prev + eps = self.eps + delta = (np.abs(current - prev)+eps)/(np.abs(prev)+eps) self.prev = current - delta[np.isnan(delta)] = 0 - norm = delta.dot(delta)**.5 + norm = delta.max() if norm < self.tolerance: - raise StopIteration + raise 
StopIteration('Convergence archived') @staticmethod def flatten_shared(shared_list): diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index ebc000a099..308bc8948c 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -131,10 +131,12 @@ def fit(self, n=10000, score=None, callbacks=None, progress.set_description('Average Loss = {:,.5g}'.format(avg_loss)) for callback in callbacks: callback(self.approx, scores[:i+1], i) - except (KeyboardInterrupt, StopIteration): # pragma: no cover + except (KeyboardInterrupt, StopIteration) as e: # do not print log on the same line progress.close() scores = scores[:i] + if isinstance(e, StopIteration): + logger.info(str(e)) if n < 10: logger.info('Interrupted at {:,d} [{:.0f}%]: Loss = {:,.5g}'.format( i, 100 * i // n, scores[i])) @@ -159,8 +161,9 @@ def fit(self, n=10000, score=None, callbacks=None, raise FloatingPointError('NaN occurred in optimization.') for callback in callbacks: callback(self.approx, None, i) - except (KeyboardInterrupt, StopIteration): - pass + except (KeyboardInterrupt, StopIteration) as e: + if isinstance(e, StopIteration): + logger.info(str(e)) finally: progress.close() self.hist = np.concatenate([self.hist, scores]) From 9c61bf2535e3b8d1b6a38428c540b4166d84f18e Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Wed, 19 Apr 2017 23:57:23 +0300 Subject: [PATCH 18/28] fix pylint, good catch --- pymc3/variational/approximations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 35df6a88d3..5a6dce2963 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -12,7 +12,7 @@ 'MeanField', 'FullRank', 'Histogram', - 'sample_vp' + 'sample_approx' ] From daa9ee0f779149fadf82eda6ba680a5211f04bdc Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Thu, 20 Apr 2017 00:06:12 +0300 Subject: [PATCH 19/28] Histogram -> Empirical (API change!) 
Following discussion in #1953 CC @twiecki, @fonnesbeck, @aseyboldt, @jsalvatier, @taku-y, @springcoil --- pymc3/tests/test_variational_inference.py | 34 +++++++++++------------ pymc3/variational/__init__.py | 2 +- pymc3/variational/approximations.py | 12 ++++---- pymc3/variational/inference.py | 8 +++--- pymc3/variational/operators.py | 6 ++-- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index 9e3e4cacb6..2bf647e609 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -7,7 +7,7 @@ from pymc3 import Model, Normal from pymc3.variational import ( ADVI, FullRankADVI, SVGD, - Histogram, + Empirical, fit ) from pymc3.variational.operators import KL @@ -281,18 +281,18 @@ class TestSVGD(TestApproximates.Base): optimizer = functools.partial(pm.adam, learning_rate=.1) -class TestHistogram(SeededTest): +class TestEmpirical(SeededTest): def test_sampling(self): with models.multidimensional_model()[1]: full_rank = FullRankADVI() approx = full_rank.fit(20) trace0 = approx.sample(10000) - histogram = Histogram(trace0) - trace1 = histogram.sample(100000) + approx = Empirical(trace0) + trace1 = approx.sample(100000) np.testing.assert_allclose(trace0['x'].mean(0), trace1['x'].mean(0), atol=0.01) np.testing.assert_allclose(trace0['x'].var(0), trace1['x'].var(0), atol=0.01) - def test_aevb_histogram(self): + def test_aevb_empirical(self): _, model, _ = models.exponential_beta(n=2) x = model.x mu = theano.shared(x.init_value) @@ -301,10 +301,10 @@ def test_aevb_histogram(self): inference = ADVI(local_rv={x: (mu, rho)}) approx = inference.approx trace0 = approx.sample(10000) - histogram = Histogram(trace0, local_rv={x: (mu, rho)}) - trace1 = histogram.sample(10000) - histogram.random(no_rand=True) - histogram.random_fn(no_rand=True) + approx = Empirical(trace0, local_rv={x: (mu, rho)}) + trace1 = approx.sample(10000) + approx.random(no_rand=True) + approx.random_fn(no_rand=True) np.testing.assert_allclose(trace0['y'].mean(0), trace1['y'].mean(0), atol=0.02) np.testing.assert_allclose(trace0['y'].var(0), trace1['y'].var(0), atol=0.02) np.testing.assert_allclose(trace0['x'].mean(0), trace1['x'].mean(0), atol=0.02) @@ -317,17 +317,17 @@ def test_random_with_transformed(self): p = pm.Uniform('p') pm.Bernoulli('trials', p, observed=trials) trace = pm.sample(1000, step=pm.Metropolis()) - histogram = Histogram(trace) - histogram.randidx(None).eval() - histogram.randidx(1).eval() - histogram.random_fn(no_rand=True) - histogram.random_fn(no_rand=False) - histogram.histogram_logp.eval() + approx = Empirical(trace) + approx.randidx(None).eval() + approx.randidx(1).eval() + approx.random_fn(no_rand=True) + approx.random_fn(no_rand=False) + approx.histogram_logp.eval() def test_init_from_noize(self): with models.multidimensional_model()[1]: - histogram = Histogram.from_noise(100) - assert histogram.histogram.eval().shape == (100, 6) + approx = Empirical.from_noise(100) + assert approx.histogram.eval().shape == (100, 6) _model = models.simple_model()[1] with _model: diff --git a/pymc3/variational/__init__.py b/pymc3/variational/__init__.py index a61d63c8db..33d72c89fc 100644 --- a/pymc3/variational/__init__.py +++ b/pymc3/variational/__init__.py @@ -23,7 +23,7 @@ fit, ) from .approximations import ( - Histogram, + Empirical, FullRank, MeanField, sample_approx diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 
5a6dce2963..a96518e30c 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -11,7 +11,7 @@ __all__ = [ 'MeanField', 'FullRank', - 'Histogram', + 'Empirical', 'sample_approx' ] @@ -232,7 +232,7 @@ def from_mean_field(cls, mean_field, gpu_compat=False): return full_rank -class Histogram(Approximation): +class Empirical(Approximation): """ Builds Approximation instance from a given trace, it has the same interface as variational approximation @@ -241,7 +241,7 @@ class Histogram(Approximation): ---------- trace : MultiTrace local_rv : dict - Experimental for Histogram + Experimental for Histogram approximation mapping {model_variable -> local_variable} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details @@ -257,10 +257,10 @@ class Histogram(Approximation): >>> with model: ... step = NUTS() ... trace = sample(1000, step=step) - ... histogram = Histogram(trace[100:]) + ... histogram = Empirical(trace[100:]) """ def __init__(self, trace, local_rv=None, model=None, seed=None): - super(Histogram, self).__init__(local_rv=local_rv, model=model, trace=trace, seed=seed) + super(Empirical, self).__init__(local_rv=local_rv, model=model, trace=trace, seed=seed) def check_model(self, model, **kwargs): trace = kwargs.get('trace') @@ -358,7 +358,7 @@ def from_noise(cls, size, jitter=.01, local_rv=None, start=None, model=None, see Returns ------- - Histogram + Empirical """ hist = cls(None, local_rv=local_rv, model=model, seed=seed) if start is None: diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index 308bc8948c..bc37130983 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -7,7 +7,7 @@ import numpy as np import pymc3 as pm -from pymc3.variational.approximations import MeanField, FullRank, Histogram +from pymc3.variational.approximations import MeanField, FullRank, Empirical from pymc3.variational.operators import KL, KSD from pymc3.variational.opvi import Approximation from pymc3.variational import test_functions @@ -377,8 +377,8 @@ class SVGD(Inference): kernel function for KSD f(histogram) -> (k(x,.), \nabla_x k(x,.)) start : dict initial point for inference - histogram : Histogram - initialize SVGD with given Histogram instead of default initial particles + histogram : Empirical + initialize SVGD with given Empirical approximation instead of default initial particles seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one @@ -392,7 +392,7 @@ class SVGD(Inference): def __init__(self, n_particles=100, jitter=.01, model=None, kernel=test_functions.rbf, start=None, histogram=None, seed=None, local_rv=None): if histogram is None: - histogram = Histogram.from_noise( + histogram = Empirical.from_noise( n_particles, jitter=jitter, start=start, model=model, local_rv=local_rv, seed=seed) super(SVGD, self).__init__( KSD, histogram, diff --git a/pymc3/variational/operators.py b/pymc3/variational/operators.py index ceb4fe6633..38a3726fa1 100644 --- a/pymc3/variational/operators.py +++ b/pymc3/variational/operators.py @@ -50,7 +50,7 @@ class KSD(Operator): Parameters ---------- - approx : pm.Histogram + approx : pm.Empirical References ---------- @@ -64,8 +64,8 @@ class KSD(Operator): OBJECTIVE = KSDObjective def __init__(self, approx): - if not isinstance(approx, pm.Histogram): - raise ValueError('approx should be a Histogram, got %r' % approx) + if not isinstance(approx, pm.Empirical): + raise 
ValueError('approx should be an Empirical approximation, got %r' % approx) Operator.__init__(self, approx) def apply(self, f): From 8b2321871c9b8f19de7fd392ae388979b277118a Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Thu, 20 Apr 2017 00:35:54 +0300 Subject: [PATCH 20/28] refactor boilerplate Inference --- pymc3/variational/inference.py | 118 +++++++++++++++++---------------- 1 file changed, 62 insertions(+), 56 deletions(-) diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index bc37130983..7766468d8f 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -88,7 +88,7 @@ def run_profiling(self, n=1000, score=None, **kwargs): progress.close() return step_func.profile - def fit(self, n=10000, score=None, callbacks=None, + def fit(self, n=10000, score=None, callbacks=None, progressbar=True, **kwargs): """ Performs Operator Variational Inference @@ -100,9 +100,9 @@ def fit(self, n=10000, score=None, callbacks=None, score : bool evaluate loss on each iteration or not callbacks : list[function : (Approximation, losses, i) -> any] - callback_every : int - call callback functions on `callback_every` step, to - interrupt inference raise `StopIteration` exception inside callback + calls provided functions after each iteration step + progressbar : bool + whether to show progressbar or not kwargs : kwargs for ObjectiveFunction.step_function Returns @@ -113,61 +113,67 @@ def fit(self, n=10000, score=None, callbacks=None, callbacks = [] score = self._maybe_score(score) step_func = self.objective.step_function(score=score, **kwargs) - i = 0 - progress = tqdm.trange(n) + progress = tqdm.trange(n, disable=not progressbar) if score: - scores = np.empty(n) - scores[:] = np.nan - try: - for i in progress: - e = step_func() - if np.isnan(e): # pragma: no cover - scores = scores[:i] - self.hist = np.concatenate([self.hist, scores]) - raise FloatingPointError('NaN occurred in optimization.') - scores[i] = e - if i % 10 == 0: - avg_loss = scores[max(0, i - 1000):i+1].mean() - progress.set_description('Average Loss = {:,.5g}'.format(avg_loss)) - for callback in callbacks: - callback(self.approx, scores[:i+1], i) - except (KeyboardInterrupt, StopIteration) as e: - # do not print log on the same line - progress.close() - scores = scores[:i] - if isinstance(e, StopIteration): - logger.info(str(e)) - if n < 10: - logger.info('Interrupted at {:,d} [{:.0f}%]: Loss = {:,.5g}'.format( - i, 100 * i // n, scores[i])) - else: - avg_loss = scores[min(0, i - 1000):i+1].mean() - logger.info('Interrupted at {:,d} [{:.0f}%]: Average Loss = {:,.5g}'.format( - i, 100 * i // n, avg_loss)) + self._iterate_with_loss(n, step_func, progress, callbacks) + else: + self._iterate_without_loss(n, step_func, progress, callbacks) + return self.approx + + def _iterate_without_loss(self, _, step_func, progress, callbacks): + try: + for i in progress: + step_func() + if np.isnan(self.approx.params[0].get_value()).any(): + raise FloatingPointError('NaN occurred in optimization.') + for callback in callbacks: + callback(self.approx, None, i) + except (KeyboardInterrupt, StopIteration) as e: + progress.close() + if isinstance(e, StopIteration): + logger.info(str(e)) + finally: + progress.close() + + def _iterate_with_loss(self, n, step_func, progress, callbacks): + scores = np.empty(n) + scores[:] = np.nan + i = 0 + try: + for i in progress: + e = step_func() + if np.isnan(e): # pragma: no cover + scores = scores[:i] + self.hist = np.concatenate([self.hist, scores]) + raise 
FloatingPointError('NaN occurred in optimization.') + scores[i] = e + if i % 10 == 0: + avg_loss = scores[max(0, i - 1000):i + 1].mean() + progress.set_description('Average Loss = {:,.5g}'.format(avg_loss)) + for callback in callbacks: + callback(self.approx, scores[:i + 1], i) + except (KeyboardInterrupt, StopIteration) as e: + # do not print log on the same line + progress.close() + scores = scores[:i] + if isinstance(e, StopIteration): + logger.info(str(e)) + if n < 10: + logger.info('Interrupted at {:,d} [{:.0f}%]: Loss = {:,.5g}'.format( + i, 100 * i // n, scores[i])) else: - if n < 10: - logger.info('Finished [100%]: Loss = {:,.5g}'.format(scores[-1])) - else: - avg_loss = scores[max(0, i - 1000):i+1].mean() - logger.info('Finished [100%]: Average Loss = {:,.5g}'.format(avg_loss)) - finally: - progress.close() - else: # pragma: no cover - scores = np.asarray(()) - try: - for i in progress: - step_func() - if np.isnan(self.approx.params[0].get_value()).any(): - raise FloatingPointError('NaN occurred in optimization.') - for callback in callbacks: - callback(self.approx, None, i) - except (KeyboardInterrupt, StopIteration) as e: - if isinstance(e, StopIteration): - logger.info(str(e)) - finally: - progress.close() + avg_loss = scores[min(0, i - 1000):i + 1].mean() + logger.info('Interrupted at {:,d} [{:.0f}%]: Average Loss = {:,.5g}'.format( + i, 100 * i // n, avg_loss)) + else: + if n < 10: + logger.info('Finished [100%]: Loss = {:,.5g}'.format(scores[-1])) + else: + avg_loss = scores[max(0, i - 1000):i + 1].mean() + logger.info('Finished [100%]: Average Loss = {:,.5g}'.format(avg_loss)) + finally: + progress.close() self.hist = np.concatenate([self.hist, scores]) - return self.approx class ADVI(Inference): From 3e6311eb485c7ea7c554d7ceb9b64335e5782dda Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Thu, 20 Apr 2017 00:36:41 +0300 Subject: [PATCH 21/28] refactor callback --- pymc3/variational/callbacks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pymc3/variational/callbacks.py b/pymc3/variational/callbacks.py index e48d55dfe0..e3caa6013a 100644 --- a/pymc3/variational/callbacks.py +++ b/pymc3/variational/callbacks.py @@ -21,7 +21,7 @@ def __init__(self, every=1000, tolerance=1e-3, eps=1e-10): def __call__(self, approx, _, i): if self.prev is None: self.prev = self.flatten_shared(approx.params) - if i < self.every or i % self.every: + if i % self.every or i < self.every: return current = self.flatten_shared(approx.params) prev = self.prev @@ -30,7 +30,7 @@ def __call__(self, approx, _, i): self.prev = current norm = delta.max() if norm < self.tolerance: - raise StopIteration('Convergence archived') + raise StopIteration('Convergence archived at %d' % i) @staticmethod def flatten_shared(shared_list): From c7e0f405f35d1ad4ba90c0b2cf681240d7d42773 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Thu, 20 Apr 2017 00:50:39 +0300 Subject: [PATCH 22/28] add progressbar supprot for init_nuts --- pymc3/sampling.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 2f08726027..7a0e20f427 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -570,7 +570,8 @@ def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, approx = pm.fit( seed=random_seed, n=n_init, method='advi', model=model, - callbacks=[pm.callbacks.CheckParametersConvergence(tolerance=1e-2)] + callbacks=[pm.callbacks.CheckParametersConvergence(tolerance=1e-2)], + progressbar=progressbar ) # type: pm.MeanField start = 
approx.sample(draws=njobs) cov = approx.cov.eval() @@ -582,7 +583,8 @@ def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, pm.fit( seed=random_seed, n=n_init, method=pm.ADVI.from_mean_field(approx), - callbacks=[pm.callbacks.CheckParametersConvergence(tolerance=1e-2)] + callbacks=[pm.callbacks.CheckParametersConvergence(tolerance=1e-2)], + progressbar=progressbar ) start = approx.sample(draws=njobs) cov = approx.cov.eval() From 5e750e7626f5cb4433c1e3a80f7866e73a92426f Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Thu, 20 Apr 2017 22:45:00 +0300 Subject: [PATCH 23/28] launch tt_rng before return --- pymc3/theanof.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pymc3/theanof.py b/pymc3/theanof.py index 10c5efcce6..a353895ae1 100644 --- a/pymc3/theanof.py +++ b/pymc3/theanof.py @@ -391,7 +391,9 @@ def tt_rng(seed=None): if seed is None: return _tt_rng else: - return MRG_RandomStreams(seed) + ret = MRG_RandomStreams(seed) + launch_rng(ret) + return ret def set_tt_rng(new_rng): From f62fd25c06f059be25d191224d90dc1a3791be30 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Fri, 21 Apr 2017 00:00:28 +0300 Subject: [PATCH 24/28] Docs --- pymc3/variational/inference.py | 117 +++++++++++++++++++++++++++++++-- pymc3/variational/opvi.py | 71 ++++++++++++++++---- 2 files changed, 171 insertions(+), 17 deletions(-) diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index 7766468d8f..04f087306e 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -179,11 +179,120 @@ def _iterate_with_loss(self, n, step_func, progress, callbacks): class ADVI(Inference): """ Automatic Differentiation Variational Inference (ADVI) + + This class implements the meanfield ADVI, where the variational + posterior distribution is assumed to be spherical Gaussian without + correlation of parameters and fit to the true posterior distribution. + The means and standard deviations of the variational posterior are referred + to as variational parameters. + + For explanation, we classify random variables in probabilistic models into + three types. Observed random variables + :math:`{\cal Y}=\{\mathbf{y}_{i}\}_{i=1}^{N}` are :math:`N` observations. + Each :math:`\mathbf{y}_{i}` can be a set of observed random variables, + i.e., :math:`\mathbf{y}_{i}=\{\mathbf{y}_{i}^{k}\}_{k=1}^{V_{o}}`, where + :math:`V_{k}` is the number of the types of observed random variables + in the model. + + The next ones are global random variables + :math:`\Theta=\{\\theta^{k}\}_{k=1}^{V_{g}}`, which are used to calculate + the probabilities for all observed samples. + + The last ones are local random variables + :math:`{\cal Z}=\{\mathbf{z}_{i}\}_{i=1}^{N}`, where + :math:`\mathbf{z}_{i}=\{\mathbf{z}_{i}^{k}\}_{k=1}^{V_{l}}`. + These RVs are used only in AEVB. + + The goal of ADVI is to approximate the posterior distribution + :math:`p(\Theta,{\cal Z}|{\cal Y})` by variational posterior + :math:`q(\Theta)\prod_{i=1}^{N}q(\mathbf{z}_{i})`. All of these terms + are normal distributions (mean-field approximation). + + :math:`q(\Theta)` is parametrized with its means and standard deviations. + These parameters are denoted as :math:`\gamma`. While :math:`\gamma` is + a constant, the parameters of :math:`q(\mathbf{z}_{i})` are dependent on + each observation. Therefore these parameters are denoted as + :math:`\\xi(\mathbf{y}_{i}; \\nu)`, where :math:`\\nu` is the parameters + of :math:`\\xi(\cdot)`. 
For example, :math:`\\xi(\cdot)` can be a + multilayer perceptron or convolutional neural network. + + In addition to :math:`\\xi(\cdot)`, we can also include deterministic + mappings for the likelihood of observations. We denote the parameters of + the deterministic mappings as :math:`\eta`. An example of such mappings is + the deconvolutional neural network used in the convolutional VAE example + in the PyMC3 notebook directory. + + This function maximizes the evidence lower bound (ELBO) + :math:`{\cal L}(\gamma, \\nu, \eta)` defined as follows: + .. math:: + + {\cal L}(\gamma,\\nu,\eta) & = + \mathbf{c}_{o}\mathbb{E}_{q(\Theta)}\left[ + \sum_{i=1}^{N}\mathbb{E}_{q(\mathbf{z}_{i})}\left[ + \log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) + \\right]\\right] \\\\ & + - \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right] + - \mathbf{c}_{l}\sum_{i=1}^{N} + KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\\right], + + where :math:`KL[q(v)||p(v)]` is the Kullback-Leibler divergence + + .. math:: + + KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv, + + :math:`\mathbf{c}_{o/g/l}` are vectors for weighting each term of ELBO. + More precisely, we can write each of the terms in ELBO as follows: + + .. math:: + + \mathbf{c}_{o}\log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) & = & + \sum_{k=1}^{V_{o}}c_{o}^{k} + \log p(\mathbf{y}_{i}^{k}| + {\\rm pa}(\mathbf{y}_{i}^{k},\Theta,\eta)) \\\\ + \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right] & = & + \sum_{k=1}^{V_{g}}c_{g}^{k}KL\left[ + q(\\theta^{k})||p(\\theta^{k}|{\\rm pa(\\theta^{k})})\\right] \\\\ + \mathbf{c}_{l}KL\left[q(\mathbf{z}_{i}||p(\mathbf{z}_{i})\\right] & = & + \sum_{k=1}^{V_{l}}c_{l}^{k}KL\left[ + q(\mathbf{z}_{i}^{k})|| + p(\mathbf{z}_{i}^{k}|{\\rm pa}(\mathbf{z}_{i}^{k}))\\right], + + where :math:`{\\rm pa}(v)` denotes the set of parent variables of :math:`v` + in the directed acyclic graph of the model. + + When using mini-batches, :math:`c_{o}^{k}` and :math:`c_{l}^{k}` should be + set to :math:`N/M`, where :math:`M` is the number of observations in each + mini-batch. This is done with supplying :code:`total_size` parameter to + observed nodes (e.g. :code:`Normal('x', 0, 1, observed=data, total_size=10000)`). + In this case it is possible to automatically determine appropriate scaling for :math:`logp` + of observed nodes. Interesting to note that it is possible to have two independent + observed variables with different :code:`total_size` and iterate them independently + during inference. + + For working with ADVI, we need to give + - The probabilistic model + (:code:`model`), the three types of RVs (:code:`observed_RVs`, + :code:`global_RVs` and :code:`local_RVs`). + + - (optional) Minibatches + The tensors to which mini-bathced samples are supplied are + handled separately by using callbacks in :code:`.fit` method + that change storage of shared theano variable or by :code:`pm.generator` + that automatically iterates over minibatches and defined beforehand. + + - (optional) Parameters of deterministic mappings + They have to be passed along with other params to :code:`.fit` method + as :code:`more_obj_params` argument. 
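(Editor's note: the minibatch workflow described in the docstring above can be illustrated with the following sketch. It is not part of the patch; it only uses the :code:`pm.generator`, :code:`total_size`, :code:`pm.fit` and :code:`approx.sample` interfaces referenced in this PR, and exact argument names may differ in the released API.)

```python
# Hedged sketch of minibatch mean-field ADVI: an endless generator feeds
# random minibatches, and total_size rescales the minibatch likelihood
# term back to the full dataset size, as described in the docstring above.
import numpy as np
import pymc3 as pm

data = np.random.randn(10000)

def minibatches(batch_size=100):
    # endless stream of random minibatches, consumed via pm.generator
    while True:
        yield np.random.choice(data, size=batch_size)

with pm.Model():
    mu = pm.Normal('mu', mu=0, sd=10)
    sd = pm.HalfNormal('sd', sd=10)
    pm.Normal('obs', mu=mu, sd=sd,
              observed=pm.generator(minibatches()),
              total_size=len(data))
    approx = pm.fit(n=10000, method='advi')   # mean-field approximation
    trace = approx.sample(draws=1000)          # draws from the variational posterior
```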
+
+    For more information concerning the training stage please refer to
+    :code:`pymc3.variational.opvi.ObjectiveFunction.step_function`
+
     Parameters
     ----------
-    local_rv : dict
-        mapping {model_variable -> local_variable}
+    local_rv : dict[var->tuple]
+        mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)}
         Local Vars are used for Autoencoding Variational Bayes
         See (AEVB; Kingma and Welling, 2014) for details
@@ -246,8 +355,8 @@ class FullRankADVI(Inference):
     Parameters
     ----------
-    local_rv : dict
-        mapping {model_variable -> local_variable}
+    local_rv : dict[var->tuple]
+        mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)}
         Local Vars are used for Autoencoding Variational Bayes
         See (AEVB; Kingma and Welling, 2014) for details
diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py
index 28ff46b3a9..cb5d0cf012 100644
--- a/pymc3/variational/opvi.py
+++ b/pymc3/variational/opvi.py
@@ -1,3 +1,36 @@
+"""
+Variational inference is a great approach for doing really complex,
+often intractable Bayesian inference in approximate form. Common methods
+(e.g. ADVI) lack the flexibility needed for the approximate posterior to
+reveal the true nature of the underlying problem, and in some applications
+this can yield unreliable decisions.
+
+Recently the [OPVI](https://arxiv.org/abs/1610.09033) framework was
+presented (Ranganath et al., 2016). It generalizes variational inference
+so that the problem is built from blocks. The first and essential block is
+the Model itself. The second is the Approximation; in some cases
+:math:`log Q(D)` is not really needed. Whether it is depends on the third
+and fourth parts of that black box, the Operator and the Test Function
+respectively.
+
+The Operator encodes the approach we take: it constructs the loss from a
+given Model, Approximation and Test Function. The test function is not
+needed if we minimize the KL divergence from Q to the posterior; as a
+drawback we then need to compute :math:`log Q(D)`. Sometimes the
+approximation family is intractable and :math:`log Q(D)` is not available;
+here comes the LS (Langevin-Stein) Operator with a set of test functions.
+
+The Test Function has a less intuitive meaning. It is usually used with the
+LS operator and represents everything we want from our approximate
+distribution. For any given vector-valued function of :math:`z`, the LS
+operator yields a function with zero mean under the posterior, so
+:math:`log Q(D)` is no longer needed. That opens the door to rich
+approximation families such as neural networks.
+
+References
+----------
+- Rajesh Ranganath, Jaan Altosaar, Dustin Tran, David M.
Blei + Operator Variational Inference + https://arxiv.org/abs/1610.09033 (2016) +""" + import warnings import numpy as np import theano @@ -251,7 +284,7 @@ class Operator(object): Subclassing ----------- - For implementing Custom operator it is needed to define `.apply(f)` method + For implementing Custom operator it is needed to define :code:`.apply(f)` method """ HAS_TEST_FUNCTION = False @@ -417,8 +450,8 @@ class Approximation(object): Parameters ---------- - local_rv : dict - mapping {model_variable -> local_variable} + local_rv : dict[var->tuple] + mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details @@ -439,37 +472,49 @@ class Approximation(object): ----------- Defining an approximation needs custom implementation of the following methods: - - `.create_shared_params()` + - :code:`.create_shared_params(**kwargs)` Returns {dict|list|theano.shared} - - `.random_global(size=None, no_rand=False)` + - :code:`.random_global(size=None, no_rand=False)` Generate samples from posterior. If `no_rand==False`: sample from MAP of initial distribution. Returns TensorVariable - - `.log_q_W_global(z)` + - :code:`.log_q_W_global(z)` It is needed only if used with operator that requires :math:`logq` of an approximation Returns Scalar + + You can also override the following methods: + - :code:`._setup(**kwargs)` + Do some specific stuff having :code:`kwargs` before calling :code:`.create_shared_params` + + - :code:`.check_model(model, **kwargs)` + Do some specific check for model having :code:`kwargs` Notes ----- - There are some defaults for approximation classes that can be + :code:`kwargs` mentioned above are supplied as additional arguments + for :code:`Approximation.__init__` + + There are some defaults class attributes for approximation classes that can be optionally overriden. - - `initial_dist_name` + - :code:`initial_dist_name` string that represents name of the initial distribution. In most cases if will be `uniform` or `normal` - - `initial_dist_map` - float where initial distribution has maximum density + - :code:`initial_dist_map` + float where initial distribution has maximum density + + References ---------- - - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 + - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf - - Kingma, D. P., & Welling, M. (2014). - Auto-Encoding Variational Bayes. stat, 1050, 1. + - Kingma, D. P., & Welling, M. (2014). + Auto-Encoding Variational Bayes. stat, 1050, 1. """ initial_dist_name = 'normal' initial_dist_map = 0. From 1d549f615c58e230bae74342b8956833ea650771 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Fri, 21 Apr 2017 00:13:00 +0300 Subject: [PATCH 25/28] add Inference to api reference --- docs/source/api/inference.rst | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/docs/source/api/inference.rst b/docs/source/api/inference.rst index efbefb82dd..c6f0eeb8c8 100644 --- a/docs/source/api/inference.rst +++ b/docs/source/api/inference.rst @@ -50,26 +50,34 @@ Hamiltonian Monte Carlo Variational ----------- -ADVI +OPVI ^^^^ -.. currentmodule:: pymc3.variational.advi +.. currentmodule:: pymc3.variational.opvi -.. automodule:: pymc3.variational.advi +.. automodule:: pymc3.variational.opvi :members: -ADVI minibatch -^^^^^^^^^^^^^^ +Inference +^^^^^^^^^ -.. 
currentmodule:: pymc3.variational.advi_minibatch +.. currentmodule:: pymc3.variational.inference -.. automodule:: pymc3.variational.advi_minibatch +.. automodule:: pymc3.variational.inference :members: -ADVI approximations -^^^^^^^^^^^^^^^^^^^ +Approximations +^^^^^^^^^^^^^^ .. currentmodule:: pymc3.variational.approximations .. automodule:: pymc3.variational.approximations :members: + +Operators +^^^^^^^^^ + +.. currentmodule:: pymc3.variational.operators + +.. automodule:: pymc3.variational.operators + :members: From bc56865c1a96e30213029bc99f1413124cde3e28 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Fri, 21 Apr 2017 00:13:57 +0300 Subject: [PATCH 26/28] fix typo in doc --- docs/source/api/data.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/api/data.rst b/docs/source/api/data.rst index bf5b65d898..8febe13e63 100644 --- a/docs/source/api/data.rst +++ b/docs/source/api/data.rst @@ -1,6 +1,6 @@ -***** +**** Data -***** +**** .. currentmodule:: pymc3.data From e80893900374b2ea6b0a72ac38b3e6f0ddc3a768 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Fri, 21 Apr 2017 00:30:10 +0300 Subject: [PATCH 27/28] make approximation docs more verbose --- pymc3/variational/approximations.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index a96518e30c..fdf0563972 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -25,8 +25,8 @@ class MeanField(Approximation): Parameters ---------- - local_rv : dict - mapping {model_variable -> local_variable} + local_rv : dict[var->tuple] + mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details @@ -100,8 +100,8 @@ class FullRank(Approximation): Parameters ---------- - local_rv : dict - mapping {model_variable -> local_variable} + local_rv : dict[var->tuple] + mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details @@ -240,9 +240,9 @@ class Empirical(Approximation): Parameters ---------- trace : MultiTrace - local_rv : dict - Experimental for Histogram approximation - mapping {model_variable -> local_variable} + local_rv : dict[var->tuple] + Experimental for Empirical Distribution + mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details From 8f99a2c3ac4c0a9290ce94a1fa6e884c0ab5a803 Mon Sep 17 00:00:00 2001 From: Maxim Kochurov Date: Fri, 21 Apr 2017 21:09:31 +0300 Subject: [PATCH 28/28] fix typos --- pymc3/variational/approximations.py | 8 ++++---- pymc3/variational/inference.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index fdf0563972..98dd2e9e4c 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -26,7 +26,7 @@ class MeanField(Approximation): Parameters ---------- local_rv : dict[var->tuple] - mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)} + mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details @@ -101,7 +101,7 @@ class 
FullRank(Approximation): Parameters ---------- local_rv : dict[var->tuple] - mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)} + mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details @@ -241,8 +241,8 @@ class Empirical(Approximation): ---------- trace : MultiTrace local_rv : dict[var->tuple] - Experimental for Empirical Distribution - mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)} + Experimental for Empirical Approximation + mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index 04f087306e..9b6dcafeaf 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -292,7 +292,7 @@ class ADVI(Inference): Parameters ---------- local_rv : dict[var->tuple] - mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)} + mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details @@ -356,7 +356,7 @@ class FullRankADVI(Inference): Parameters ---------- local_rv : dict[var->tuple] - mapping {model_variable -> local_variable (:math:`\\mu`, math:`\\rho`)} + mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details @@ -523,8 +523,8 @@ def fit(n=10000, local_rv=None, method='advi', model=None, seed=None, **kwargs): ---------- n : int number of iterations - local_rv : dict - mapping {model_variable -> local_variable} + local_rv : dict[var->tuple] + mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details method : str or Inference
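(Editor's note: to round off the series, here is a hedged, illustrative sketch of the top-level workflow these patches converge on — :code:`pm.fit` with a parameter-convergence callback, sampling from the fitted approximation, and wrapping a trace as an :code:`Empirical` approximation. It is not part of the patches; argument names follow the docstrings and diffs above and may differ in the released API.)

```python
# Illustrative end-to-end usage of the refactored variational API.
import numpy as np
import pymc3 as pm

data = np.random.randn(1000)

with pm.Model():
    mu = pm.Normal('mu', mu=0, sd=1)
    pm.Normal('obs', mu=mu, sd=1, observed=data)

    # Stop early once the variational parameters stop moving
    # (CheckParametersConvergence raises StopIteration, which fit() logs).
    approx = pm.fit(
        n=50000, method='advi',
        callbacks=[pm.callbacks.CheckParametersConvergence(every=1000, tolerance=1e-3)],
    )
    trace = approx.sample(draws=2000)   # draws from the variational posterior

    # A MultiTrace can itself be wrapped as an Empirical approximation,
    # e.g. to provide initial particles for SVGD.
    emp = pm.Empirical(trace)
```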