diff --git a/pymc3/data.py b/pymc3/data.py
index 944ce55458..fcac737719 100644
--- a/pymc3/data.py
+++ b/pymc3/data.py
@@ -1,7 +1,10 @@
+import itertools
 import pkgutil
 import io
 
-__all__ = ['get_data_file']
+import theano.tensor as tt
+
+__all__ = ['get_data_file', 'DataGenerator']
 
 
 def get_data_file(pkg, path):
@@ -19,3 +22,28 @@ def get_data_file(pkg, path):
     """
 
     return io.BytesIO(pkgutil.get_data(pkg, path))
+
+
+class DataGenerator(object):
+    """
+    Helper class that infers the data type of a generator by looking
+    at its first item, while preserving the order of the resulting generator
+    """
+    def __init__(self, generator):
+        self.test_value = next(generator)
+        self.gen = itertools.chain([self.test_value], generator)
+        self.tensortype = tt.TensorType(
+            self.test_value.dtype,
+            ((False, ) * self.test_value.ndim))
+
+    def __next__(self):
+        return next(self.gen)
+
+    def __iter__(self):
+        return self
+
+    def __eq__(self, other):
+        return id(self) == id(other)
+
+    def __hash__(self):
+        return hash(id(self))
diff --git a/pymc3/distributions/dist_math.py b/pymc3/distributions/dist_math.py
index 90ae896529..0c524fda4c 100644
--- a/pymc3/distributions/dist_math.py
+++ b/pymc3/distributions/dist_math.py
@@ -10,6 +10,9 @@
 from .special import gammaln, multigammaln
 
 
+c = - 0.5 * np.log(2 * np.pi)
+
+
 def bound(logp, *conditions, **kwargs):
     """
     Bounds a log probability density with several conditions.
@@ -95,3 +98,64 @@ def i1(x):
         x**9 / 1474560 + x**11 / 176947200 + x**13 / 29727129600,
         np.e**x / (2 * np.pi * x)**0.5 * (1 - 3 / (8 * x) + 15 / (128 * x**2) + 315 / (3072 * x**3) +
                                           14175 / (98304 * x**4)))
+
+
+def sd2rho(sd):
+    """
+    `sd -> rho` theano converter
+
+    :math:`mu + sd*e = mu + log(1+exp(rho))*e`"""
+    return tt.log(tt.exp(sd) - 1)
+
+
+def rho2sd(rho):
+    """
+    `rho -> sd` theano converter
+
+    :math:`mu + sd*e = mu + log(1+exp(rho))*e`"""
+    return tt.log1p(tt.exp(rho))
+
+
+def log_normal(x, mean, **kwargs):
+    """
+    Calculate the logarithm of the normal density at point `x`
+    for a given `mean` and standard deviation
+
+    Parameters
+    ----------
+    x : Tensor
+        point of evaluation
+    mean : Tensor
+        mean of normal distribution
+    kwargs : exactly one of the parameters `{sd, tau, w, rho}`
+
+    Notes
+    -----
+    There are four variants for density parametrization.
+ They are: + 1) standard deviation - `std` + 2) `w`, logarithm of `std` :math:`w = log(std)` + 3) `rho` that follows this equation :math:`rho = log(exp(std) - 1)` + 4) `tau` that follows this equation :math:`tau = std^{-1}` + ---- + """ + sd = kwargs.get('sd') + w = kwargs.get('w') + rho = kwargs.get('rho') + tau = kwargs.get('tau') + eps = kwargs.get('eps', 0.0) + check = sum(map(lambda a: a is not None, [sd, w, rho, tau])) + if check > 1: + raise ValueError('more than one required kwarg is passed') + if check == 0: + raise ValueError('none of required kwarg is passed') + if sd is not None: + std = sd + elif w is not None: + std = tt.exp(w) + elif rho is not None: + std = rho2sd(rho) + else: + std = tau**(-1) + std += eps + return c - tt.log(tt.abs_(std)) - (x - mean) ** 2 / (2 * std ** 2) diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index 2ad2243f74..2a38b91aea 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -8,7 +8,7 @@ from .dist_math import bound -__all__ = ['DensityDist', 'Distribution', 'Continuous', 'Bound', +__all__ = ['DensityDist', 'Distribution', 'Continuous', 'Bound', 'Discrete', 'NoDistribution', 'TensorType', 'draw_values'] @@ -30,8 +30,9 @@ def __new__(cls, name, *args, **kwargs): if isinstance(name, string_types): data = kwargs.pop('observed', None) + total_size = kwargs.pop('total_size', None) dist = cls.dist(*args, **kwargs) - return model.Var(name, dist, data) + return model.Var(name, dist, data, total_size) else: raise TypeError("Name needs to be a string but got: %s" % name) @@ -410,7 +411,7 @@ def __init__(self, distribution, lower, upper, transform='infer', *args, **kwarg self.transform = transforms.upperbound(upper) if default >= upper: self.testval = upper - 1 - + if issubclass(distribution, Discrete): self.transform = None diff --git a/pymc3/math.py b/pymc3/math.py index 97c6d92b50..adb34bef7a 100644 --- a/pymc3/math.py +++ b/pymc3/math.py @@ -22,3 +22,7 @@ def invlogit(x, eps=sys.float_info.epsilon): def logit(p): return tt.log(p / (1 - p)) + + +def flatten_list(tensors): + return tt.concatenate([var.ravel() for var in tensors]) diff --git a/pymc3/model.py b/pymc3/model.py index 6017484d06..8c480c87e0 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -8,7 +8,7 @@ import pymc3 as pm from .memoize import memoize -from .theanof import gradient, hessian, inputvars +from .theanof import gradient, hessian, inputvars, generator from .vartypes import typefilter, discrete_types, continuous_types from .blocking import DictToArrayBijection, ArrayOrdering @@ -458,7 +458,7 @@ def cont_vars(self): """All the continuous variables in the model""" return list(typefilter(self.vars, continuous_types)) - def Var(self, name, dist, data=None): + def Var(self, name, dist, data=None, total_size=None): """Create and add (un)observed random variable to the model with an appropriate prior distribution. @@ -469,6 +469,8 @@ def Var(self, name, dist, data=None): data : array_like (optional) If data is provided, the variable is observed. If None, the variable is unobserved. 
+ total_size : scalar + upscales logp of variable with :math:`coef = total_size/var.shape[0]` Returns ------- @@ -477,11 +479,13 @@ def Var(self, name, dist, data=None): name = self.name_for(name) if data is None: if getattr(dist, "transform", None) is None: - var = FreeRV(name=name, distribution=dist, model=self) + var = FreeRV(name=name, distribution=dist, model=self, + total_size=total_size) self.free_RVs.append(var) else: var = TransformedRV(name=name, distribution=dist, model=self, - transform=dist.transform) + transform=dist.transform, + total_size=total_size) pm._log.debug('Applied {transform}-transform to {name}' ' and added transformed {orig_name} to model.'.format( transform=dist.transform.name, @@ -491,7 +495,7 @@ def Var(self, name, dist, data=None): return var elif isinstance(data, dict): var = MultiObservedRV(name=name, data=data, distribution=dist, - model=self) + model=self, total_size=total_size) self.observed_RVs.append(var) if var.missing_values: self.free_RVs += var.missing_values @@ -500,7 +504,8 @@ def Var(self, name, dist, data=None): self.named_vars[v.name] = v else: var = ObservedRV(name=name, data=data, - distribution=dist, model=self) + distribution=dist, model=self, + total_size=total_size) self.observed_RVs.append(var) if var.missing_values: self.free_RVs.append(var.missing_values) @@ -717,7 +722,7 @@ class FreeRV(Factor, TensorVariable): """Unobserved random variable that a model is specified in terms of.""" def __init__(self, type=None, owner=None, index=None, name=None, - distribution=None, model=None): + distribution=None, model=None, total_size=None): """ Parameters ---------- @@ -725,7 +730,10 @@ def __init__(self, type=None, owner=None, index=None, name=None, owner : theano owner (optional) name : str distribution : Distribution - model : Model""" + model : Model + total_size : scalar Tensor (optional) + needed for upscaling logp + """ if type is None: type = distribution.type super(FreeRV, self).__init__(type, owner, index, name) @@ -736,7 +744,14 @@ def __init__(self, type=None, owner=None, index=None, name=None, self.distribution = distribution self.tag.test_value = np.ones( distribution.shape, distribution.dtype) * distribution.default() - self.logp_elemwiset = distribution.logp(self) + logp_elemwiset = distribution.logp(self) + if total_size is None: + coef = tt.as_tensor(1) + else: + assert logp_elemwiset.ndim >= 1, ('Variable with scaled density ' + 'needs to be at least 1 dimensional') + coef = tt.as_tensor(total_size) / logp_elemwiset.shape[0] + self.logp_elemwiset = logp_elemwiset * coef self.model = model incorporate_methods(source=distribution, destination=self, @@ -759,6 +774,8 @@ def pandas_to_array(data): return data elif isinstance(data, theano.gof.graph.Variable): return data + elif hasattr(data, '__next__'): + return generator(data) else: return np.asarray(data) @@ -792,7 +809,7 @@ class ObservedRV(Factor, TensorVariable): """ def __init__(self, type=None, owner=None, index=None, name=None, data=None, - distribution=None, model=None): + distribution=None, model=None, total_size=None): """ Parameters ---------- @@ -801,6 +818,8 @@ def __init__(self, type=None, owner=None, index=None, name=None, data=None, name : str distribution : Distribution model : Model + total_size : scalar Tensor (optional) + needed for upscaling logp """ from .distributions import TensorType if type is None: @@ -814,7 +833,14 @@ def __init__(self, type=None, owner=None, index=None, name=None, data=None, data = as_tensor(data, name, model, distribution) 
self.missing_values = data.missing_values - self.logp_elemwiset = distribution.logp(data) + logp_elemwiset = distribution.logp(data) + if total_size is None: + coef = tt.as_tensor(1) + else: + assert logp_elemwiset.ndim >= 1, ('Variable with scaled density ' + 'needs to be at least 1 dimensional') + coef = tt.as_tensor(total_size) / logp_elemwiset.shape[0] + self.logp_elemwiset = logp_elemwiset * coef self.model = model self.distribution = distribution @@ -835,7 +861,7 @@ class MultiObservedRV(Factor): Potentially partially observed. """ - def __init__(self, name, data, distribution, model): + def __init__(self, name, data, distribution, model, total_size=None): """ Parameters ---------- @@ -844,6 +870,8 @@ def __init__(self, name, data, distribution, model): name : str distribution : Distribution model : Model + total_size : scalar Tensor (optional) + needed for upscaling logp """ self.name = name self.data = {name: as_tensor(data, name, model, distribution) @@ -851,7 +879,14 @@ def __init__(self, name, data, distribution, model): self.missing_values = [datum.missing_values for datum in self.data.values() if datum.missing_values is not None] - self.logp_elemwiset = distribution.logp(**self.data) + logp_elemwiset = distribution.logp(**self.data) + if total_size is None: + coef = tt.as_tensor(1) + else: + assert logp_elemwiset.ndim >= 1, ('Variable with scaled density ' + 'needs to be at least 1 dimensional') + coef = tt.as_tensor(total_size) / logp_elemwiset.shape[0] + self.logp_elemwiset = logp_elemwiset * coef self.model = model self.distribution = distribution @@ -896,17 +931,20 @@ def Potential(name, var, model=None): class TransformedRV(TensorVariable): def __init__(self, type=None, owner=None, index=None, name=None, - distribution=None, model=None, transform=None): + distribution=None, model=None, transform=None, + total_size=None): """ Parameters ---------- type : theano type (optional) owner : theano owner (optional) - name : str distribution : Distribution - model : Model""" + model : Model + total_size : scalar Tensor (optional) + needed for upscaling logp + """ if type is None: type = distribution.type super(TransformedRV, self).__init__(type, owner, index, name) @@ -916,7 +954,7 @@ def __init__(self, type=None, owner=None, index=None, name=None, transformed_name = "{}_{}_".format(name, transform.name) self.transformed = model.Var( - transformed_name, transform.apply(distribution)) + transformed_name, transform.apply(distribution), total_size=total_size) normalRV = transform.backward(self.transformed) diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py index 4eb96e6789..0b9696a4ae 100644 --- a/pymc3/tests/test_model.py +++ b/pymc3/tests/test_model.py @@ -1,5 +1,5 @@ import unittest -import theano.tensor as tt +from theano import theano, tensor as tt import pymc3 as pm from pymc3.distributions import HalfCauchy, Normal from pymc3 import Potential, Deterministic @@ -118,3 +118,12 @@ def test_model_root(self): self.assertTrue(model is model.root) with pm.Model() as sub: self.assertTrue(model is sub.root) + + def test_density_scaling(self): + with pm.Model() as model1: + Normal('n', observed=[[1]], total_size=1) + p1 = theano.function([], model1.logpt) + with pm.Model() as model2: + Normal('n', observed=[[1]], total_size=2) + p2 = theano.function([], model2.logpt) + self.assertEqual(p1() * 2, p2()) diff --git a/pymc3/tests/test_theanof.py b/pymc3/tests/test_theanof.py new file mode 100644 index 0000000000..ebc7308781 --- /dev/null +++ b/pymc3/tests/test_theanof.py @@ 
-0,0 +1,50 @@ +import itertools +import unittest +import numpy as np +import theano +from ..theanof import DataGenerator, GeneratorOp, generator + + +def integers(): + i = 0 + while True: + yield np.float32(i) + i += 1 + + +def integers_ndim(ndim): + i = 0 + while True: + yield np.ones((2,) * ndim) * i + i += 1 + + +class TestGenerator(unittest.TestCase): + def test_basic(self): + generator = DataGenerator(integers()) + gop = GeneratorOp(generator)() + self.assertEqual(gop.tag.test_value, np.float32(0)) + f = theano.function([], gop) + self.assertEqual(f(), np.float32(0)) + self.assertEqual(f(), np.float32(1)) + for i in range(2, 100): + f() + self.assertEqual(f(), np.float32(100)) + + def test_ndim(self): + for ndim in range(10): + res = list(itertools.islice(integers_ndim(ndim), 0, 2)) + generator = DataGenerator(integers_ndim(ndim)) + gop = GeneratorOp(generator)() + f = theano.function([], gop) + self.assertEqual(ndim, res[0].ndim) + np.testing.assert_equal(f(), res[0]) + np.testing.assert_equal(f(), res[1]) + + def test_cloning_available(self): + gop = generator(integers()) + res = gop ** 2 + shared = theano.shared(np.float32(10)) + res1 = theano.clone(res, {gop: shared}) + f = theano.function([], res1) + self.assertEqual(f(), np.float32(100)) diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py new file mode 100644 index 0000000000..0b75a69b6a --- /dev/null +++ b/pymc3/tests/test_variational_inference.py @@ -0,0 +1,112 @@ +import unittest +import numpy as np +import theano +import theano.tensor as tt +from theano.sandbox.rng_mrg import MRG_RandomStreams +import pymc3 as pm +from pymc3 import Model, Normal +from pymc3.variational.approximations import ADVI +from pymc3.variational.inference import approximate +from ..theanof import set_tt_rng +from . 
import models +set_tt_rng(MRG_RandomStreams(42)) + + +class TestApproximates: + class Base(unittest.TestCase): + approx = None + NITER = 30000 + + def test_elbo(self): + if self.approx is not ADVI: + raise unittest.SkipTest + mu0 = 1.5 + sigma = 1.0 + y_obs = np.array([1.6, 1.4]) + + post_mu = np.array([1.88]) + post_sd = np.array([1]) + # Create a model for test + with Model() as model: + mu = Normal('mu', mu=mu0, sd=sigma) + Normal('y', mu=mu, sd=1, observed=y_obs) + + # Create variational gradient tensor + mean_field = self.approx(model=model) + elbo = mean_field.elbo(samples=10000) + + mean_field.shared_params['mu'].set_value(post_mu) + mean_field.shared_params['rho'].set_value(np.log(np.exp(post_sd) - 1)) + + f = theano.function([], elbo.mean()) + elbo_mc = f() + + # Exact value + elbo_true = (-0.5 * ( + 3 + 3 * post_mu**2 - 2 * (y_obs[0] + y_obs[1] + mu0) * post_mu + + y_obs[0]**2 + y_obs[1]**2 + mu0**2 + 3 * np.log(2 * np.pi)) + + 0.5 * (np.log(2 * np.pi) + 1)) + np.testing.assert_allclose(elbo_mc, elbo_true, rtol=0, atol=1e-1) + + def test_vary_samples(self): + _, model, _ = models.simple_model() + i = tt.iscalar('i') + i.tag.test_value = 1 + with model: + mf = self.approx() + elbo = mf.elbo(i) + elbos = theano.function([i], elbo) + self.assertEqual(elbos(1).shape[0], 1) + self.assertEqual(elbos(10).shape[0], 10) + + def test_vars_view(self): + _, model, _ = models.multidimensional_model() + with model: + mf = self.approx() + posterior = mf.posterior(10) + x_sampled = mf.view_from(posterior, 'x').eval() + self.assertEqual(x_sampled.shape, (10,) + model['x'].dshape) + + def test_sample_vp(self): + n_samples = 100 + xs = np.random.binomial(n=1, p=0.2, size=n_samples) + with pm.Model(): + p = pm.Beta('p', alpha=1, beta=1) + pm.Binomial('xs', n=1, p=p, observed=xs) + mf = self.approx() + trace = mf.sample_vp(draws=1, hide_transformed=True) + self.assertListEqual(trace.varnames, ['p']) + trace = mf.sample_vp(draws=1, hide_transformed=False) + self.assertListEqual(sorted(trace.varnames), ['p', 'p_logodds_']) + + def test_advi_optimizer(self): + n = 1000 + sd0 = 2. + mu0 = 4. + sd = 3. + mu = -5. + + data = sd * np.random.randn(n) + mu + + d = n / sd ** 2 + 1 / sd0 ** 2 + mu_post = (n * np.mean(data) / sd ** 2 + mu0 / sd0 ** 2) / d + + with Model(): + mu_ = Normal('mu', mu=mu0, sd=sd0, testval=0) + Normal('x', mu=mu_, sd=sd, observed=data) + pm.Deterministic('mu_sq', mu_**2) + mf = self.approx() + approximate(self.NITER, method=mf) + trace = mf.sample_vp(10000) + + np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) + np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. 
/ d), rtol=0.4)
+        np.testing.assert_allclose(mf.median['mu'], mu_post, rtol=0.1)
+        np.testing.assert_allclose(mf.median['mu_sq'], mu_post**2, rtol=0.1)
+
+
+class TestMeanField(TestApproximates.Base):
+    approx = ADVI
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/pymc3/theanof.py b/pymc3/theanof.py
index 56bad3fb0e..ae8648d926 100644
--- a/pymc3/theanof.py
+++ b/pymc3/theanof.py
@@ -1,15 +1,47 @@
 import numpy as np
+import contextlib
 from .vartypes import typefilter, continuous_types
 from theano import theano, scalar, tensor as tt
 from theano.gof.graph import inputs
+from theano.configparser import change_flags
+from theano.tensor import TensorVariable
+from theano.gof import Op
+from theano.configparser import change_flags
 from .memoize import memoize
 from .blocking import ArrayOrdering
-
-__all__ = ['gradient', 'hessian', 'hessian_diag', 'inputvars',
-           'cont_inputs', 'floatX', 'jacobian',
-           'CallableTensor', 'join_nonshared_inputs',
-           'make_shared_replacements']
+from theano.sandbox.rng_mrg import MRG_RandomStreams
+from .data import DataGenerator
+
+
+__all__ = ['gradient',
+           'hessian',
+           'hessian_diag',
+           'inputvars',
+           'cont_inputs',
+           'jacobian',
+           'CallableTensor',
+           'join_nonshared_inputs',
+           'make_shared_replacements',
+           'tt_rng',
+           'set_tt_rng',
+           'no_test_val',
+           'change_flags',
+           'generator']
+
+
+@contextlib.contextmanager
+def no_test_val():
+    """Context manager that temporarily disables theano test values.
+    For decorating functions, use `theano.configparser.change_flags` instead.
+
+    Usage
+    -----
+    >>> with no_test_val():
+    ...
+    """
+    theano.config.compute_test_value = 'off'
+    yield
+    theano.config.compute_test_value = 'raise'
 
 
 def inputvars(a):
@@ -242,3 +274,86 @@ def __call__(self, input):
 
 scalar_identity = IdentityOp(scalar.upgrade_to_float, name='scalar_identity')
 identity = tt.Elemwise(scalar_identity, name='identity')
+
+
+@change_flags(compute_test_value='off')
+def launch_rng(rng):
+    """Helper function for safely initializing an rng.
+    If the rng is not launched this way, there can be problems with test_value
+
+    Parameters
+    ----------
+    rng : `theano.sandbox.rng_mrg.MRG_RandomStreams` instance
+    """
+    state = rng.rstate
+    rng.inc_rstate()
+    rng.set_rstate(state)
+
+_tt_rng = MRG_RandomStreams()
+launch_rng(_tt_rng)
+
+
+def tt_rng():
+    """Get the package-level random number generator.
+
+    Returns
+    -------
+    `theano.sandbox.rng_mrg.MRG_RandomStreams` instance
+        instance passed to the most recent call of `set_tt_rng`
+    """
+    return _tt_rng
+
+
+def set_tt_rng(new_rng):
+    """Set the package-level random number generator.
+
+    Parameters
+    ----------
+    new_rng : `theano.sandbox.rng_mrg.MRG_RandomStreams` instance
+        The random number generator to use.
+    """
+    global _tt_rng
+    _tt_rng = new_rng
+    launch_rng(_tt_rng)
+
+
+class GeneratorOp(Op):
+    """
+    Generator Op is designed for storing python generators
+    inside a theano graph. The main limitation is the generator itself.
+
+    There are some important cases when a generator becomes exhausted:
+        - a finite (non-endless) generator is passed
+        - an exception is raised while `generator.__next__` is performed.
+          Note: this is dangerous with plain python generators, but fine with
+          custom class-based generators that keep explicit state
+    In addition, the data type must be the same on each iteration.
+    """
+    __props__ = ('generator',)
+
+    def __init__(self, gen):
+        if not isinstance(gen, DataGenerator):
+            gen = DataGenerator(gen)
+        super(GeneratorOp, self).__init__()
+        self.generator = gen
+        self.itypes = []
+        self.otypes = [self.generator.tensortype]
+
+    def perform(self, node, inputs, output_storage, params=None):
+        try:
+            output_storage[0][0] = self.generator.__next__()
+        except StopIteration:
+            output_storage[0][0] = np.nan
+
+    def do_constant_folding(self, node):
+        return False
+
+    @change_flags(compute_test_value='off')
+    def __call__(self, *args, **kwargs):
+        rval = super(GeneratorOp, self).__call__(*args, **kwargs)
+        rval.tag.test_value = self.generator.test_value
+        return rval
+
+
+def generator(gen):
+    """shortcut for `GeneratorOp`"""
+    return GeneratorOp(gen)()
diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py
new file mode 100644
index 0000000000..42caf4bd72
--- /dev/null
+++ b/pymc3/variational/approximations.py
@@ -0,0 +1,597 @@
+import collections
+import theano.tensor as tt
+from theano.ifelse import ifelse
+import theano
+import pymc3 as pm
+from ..theanof import tt_rng, memoize, change_flags
+from ..blocking import ArrayOrdering
+from ..distributions.dist_math import rho2sd, log_normal
+from ..math import flatten_list
+from ..model import modelcontext
+import numpy as np
+
+# helper class
+FlatView = collections.namedtuple('FlatView', 'input, replacements')
+# shortcut for zero grad
+Z = theano.gradient.zero_grad
+
+
+class BaseApproximation(object):
+    """
+    Base class for variational methods
+
+    It uses the idea that we can map an initial standard normal
+    distribution to the approximate posterior distribution.
+
+    Three methods are required for an implementation:
+    1) creating shared parameters `create_shared_params()`
+    2) mapping the initial distribution to the posterior `posterior{_{global|local}}(initial)`
+    3) calculating log_q_W for the approximation `log_q_W{_{global|local}}(posterior)`
+
+    The ELBO can then be calculated for the given approximation as
+    `log_p_D(posterior) - KL_q_p_W(posterior)`
+
+    Parameters
+    ----------
+    population : dict[Variable->int]
+        maps an observed_RV to its population size;
+        if not provided, defaults to the full population
+        Note: the population size is `shape[0]` of the whole data
+    known : dict[Variable->(mu, rho)]
+        maps a local random variable to mu and rho for its posterior parametrization;
+        it is used for Auto-Encoding Variational Bayes
+        (AEVB; Kingma and Welling, 2014)[1]_
+    model : Model
+        PyMC3 model whose posterior is going to be approximated
+
+    References
+    ----------
+    .. [1] Kingma, D. P., & Welling, M. (2014).
+        Auto-Encoding Variational Bayes. stat, 1050, 1.
+ """ + def __init__(self, population=None, known=None, model=None): + model = modelcontext(model) + self.check_model(model) + if known is None: + known = dict() + if population is None: + population = dict() + self.population = population + self.model = model + + def get_transformed(v): + if hasattr(v, 'transformed'): + return v.transformed + return v + known = {get_transformed(k): v for k, v in known.items()} + self.known = known + self.local_vars = [v for v in model.free_RVs if v in known] + self.global_vars = [v for v in model.free_RVs if v not in known] + self.order = ArrayOrdering(self.local_vars + self.global_vars) + inputvar = tt.vector('flat_view', dtype=theano.config.floatX) + inputvar.tag.test_value = flatten_list(self.local_vars + self.global_vars).tag.test_value + replacements = {self.model.named_vars[name]: inputvar[slc].reshape(shape).astype(dtype) + for name, slc, shape, dtype in self.order.vmap} + self.flat_view = FlatView(inputvar, replacements) + self.view = {vm.var: vm for vm in self.order.vmap} + self.shared_params = self.create_shared_params() + + def set_params(self, params): + self.shared_params.update(params) + + @property + def params(self): + return list(self.shared_params.values()) + + @staticmethod + def check_model(model): + """ + Checks that model is valid for variational inference + """ + vars_ = [var for var in model.vars if not isinstance(var, pm.model.ObservedRV)] + if any([var.dtype in pm.discrete_types for var in vars_]): + raise ValueError('Model should not include discrete RVs') + + @property + def input(self): + """ + Shortcut to flattened input + """ + return self.flat_view.input + + def create_shared_params(self): + """ + Any stuff you need will be here + + Returns + ------- + dict : shared params + """ + raise NotImplementedError + + def initial(self, samples, subset='all', zeros=False): + """ + Initial distribution for constructing posterior + + Parameters + ---------- + samples : int - number of samples + subset : determines what space is used can be {all|local|global} + zeros : bool - return zeros if True + + Returns + ------- + Tensor + sampled latent space shape(samples, size) + """ + assert subset in {'all', 'local', 'global'} + if subset == 'all': + size = self.total_size + elif subset == 'global': + size = self.global_size + else: + size = self.local_size + shape = tt.stack([tt.as_tensor(samples), tt.as_tensor(size)]) + if isinstance(zeros, bool): + zeros = int(zeros) + zeros = tt.as_tensor(zeros) + space = ifelse(zeros, + tt.zeros(shape), + tt_rng().normal(shape)) + return space + + def posterior(self, samples=1, zeros=False): + """ + Transforms initial latent space to posterior distribution + + Parameters + ---------- + samples : number of samples + zeros : set initial distribution to zeros + + Returns + ------- + Tensor + posterior space + """ + if self.local_vars and self.global_vars: + return tt.concatenate([ + self.posterior_local(self.initial(samples, 'local', zeros)), + self.posterior_global(self.initial(samples, 'global', zeros)) + ], axis=1) + elif self.local_vars: + return self.posterior_local(self.initial(samples, 'local', zeros)) + elif self.global_vars: + return self.posterior_global(self.initial(samples, 'global', zeros)) + else: + raise ValueError('No FreeVARs in model') + + def sample_over_space(self, space, node): + """ + Take samples of node using `space` as distribution of random variables + + Parameters + ---------- + space : space to sample over + node : node that has flattened input + + Returns + ------- + samples + 
""" + def replace_node(post): + return theano.clone(node, {self.input: post}) + samples, _ = theano.scan(replace_node, space, n_steps=space.shape[0]) + return samples + + def view_from(self, space, name, subset='all', reshape=True): + """ + Construct view on a variable from flattened `space` + + Parameters + ---------- + space : space to take view of variable from + name : str + name of variable + subset : str + determines what space is used can be {all|local|global} + reshape : bool + whether to reshape variable from vectorized view + Returns + ------- + variable + shape == (samples,) + variable.shape + """ + assert subset in {'all', 'local', 'global'} + if subset in {'all', 'local'}: + slc = self.view[name].slc + else: + s = self.view[name].slc.start + e = self.view[name].slc.stop + slc = slice(s - self.local_size, e - self.local_size) + _, _, shape, dtype = self.view[name] + view = space[:, slc] + if reshape: + view = view.reshape((space.shape[0],) + shape) + return view.astype(dtype) + + def posterior_global(self, initial): + """ + Implements posterior distribution from initial latent space + + Parameters + ---------- + initial : initial latent space shape(samples, size) + + Returns + ------- + global posterior space + """ + raise NotImplementedError + + def posterior_local(self, initial): + """ + Implements posterior distribution from initial latent space + + Parameters + ---------- + initial : initial latent space shape(samples, size) + + Returns + ------- + local posterior space + """ + if not self.local_vars: + return initial + mu, rho = self._local_mu_rho() + x = initial * rho2sd(rho) + mu + return x + + def _local_mu_rho(self): + mu = [] + rho = [] + for var in self.local_vars: + mu.append(self.known[var][0].ravel()) + rho.append(self.known[var][1].ravel()) + mu = tt.concatenate(mu) + rho = tt.concatenate(rho) + return mu, rho + + def to_flat_input(self, node): + """ + Replaces vars with flattened view stored in self.input + """ + return theano.clone(node, self.flat_view.replacements, strict=False) + + def log_q_W_local(self, posterior): + """ + log_q_W samples over q for local vars + + Gradient wrt mu, rho in density parametrization + is set to zero to lower variance of ELBO + """ + if not self.local_vars: + return 0 + logp = [] + for var in self.local_vars: + mu = self.known[var][0].ravel() + rho = self.known[var][1].ravel() + x = self.view_from(posterior, var.name, 'all', reshape=False) + q = log_normal(x, Z(mu), rho=Z(rho)) + logp.append(self.weighted_logp(var, q)) + samples = tt.sum(tt.concatenate(logp, axis=1), axis=1) + return samples + + def log_q_W_global(self, posterior): + """ + log_q_W samples over q for global vars + """ + raise NotImplementedError + + def log_q_W(self, posterior): + """ + log_q_W samples over q + """ + return self.log_q_W_global(posterior) + self.log_q_W_local(posterior) + + def log_p_D(self, posterior): + """ + log_p_D samples over q + """ + _log_p_D_ = tt.sum( + list(map(self.weighted_logp, self.model.observed_RVs)) + ) + _log_p_D_ = self.to_flat_input(_log_p_D_) + samples = self.sample_over_space(posterior, _log_p_D_) + return samples + + def weighted_logp(self, var, logp=None): + """ + Weight logp according to given population size + + Parameters + ---------- + var : Random Variable + logp : None or given density for variable + + Returns + ------- + Tensor + weighted logp + """ + tot = self.population.get(var) + if logp is None: + logp = var.logpt + if tot is not None: + tot = tt.as_tensor(tot) + logp = logp * tot / var.shape[0] + return logp + 
+ def KL_q_p_W(self, posterior): + """ + KL(q||p) samples over q + """ + return self.log_q_W(posterior) - self.log_p_W(posterior) + + def log_p_W_local(self, posterior): + """ + log_p_W_local samples over q + """ + if not self.local_vars: + return 0 + _log_p_W_local_ = tt.sum( + list(map(self.weighted_logp, self.local_vars)) + ) + _log_p_W_local_ = self.to_flat_input(_log_p_W_local_) + samples = self.sample_over_space(posterior, _log_p_W_local_) + return samples + + def log_p_W_global(self, posterior): + """ + log_p_W_global samples over q + """ + def logp(var): + return var.logpt + _log_p_W_global_ = tt.sum( + list(map(logp, self.global_vars)) + ) + _log_p_W_global_ = self.to_flat_input(_log_p_W_global_) + samples = self.sample_over_space(posterior, _log_p_W_global_) + return samples + + def log_p_W(self, posterior): + """ + log_p_W samples over q + """ + def logp(var): + return var.logpt + if self.local_vars: + _log_p_W_local_ = tt.sum( + list(map(self.weighted_logp, self.local_vars)) + ) + else: + _log_p_W_local_ = 0 + _log_p_W_global_ = tt.sum( + list(map(logp, self.global_vars)) + ) + _log_p_W_ = _log_p_W_global_ + _log_p_W_local_ + _log_p_W_ = self.to_flat_input(_log_p_W_) + samples = self.sample_over_space(posterior, _log_p_W_) + return samples + + def apply_replacements(self, node, deterministic=False, include=None, exclude=None): + """ + Replace variables in graph with variational approximation. By default, replaces all variables + + Parameters + ---------- + node : Variable + node for replacements + deterministic : bool + whether to use zeros as initial distribution + if True - zero initial point will produce constant latent variables + include : list + latent variables to be replaced + exclude : list + latent variables to be excluded for replacements + + Returns + ------- + node with replacements + """ + if include is not None and exclude is not None: + raise ValueError('Only one parameter is supported {include|exclude}, got two') + if include is not None: + replacements = {k: v for k, v + in self.flat_view.replacements.items() if k in include} + elif exclude is not None: + replacements = {k: v for k, v + in self.flat_view.replacements.items() if k not in exclude} + else: + replacements = self.flat_view.replacements + node = theano.clone(node, replacements, strict=False) + posterior = self.posterior(zeros=deterministic)[0] + return theano.clone(node, {self.input: posterior}) + + def elbo(self, samples=1, pi_local=1, pi_global=1): + """ + Evidence Lower Bound - ELBO + + To fit variational posterior we need to maximize ELBO + wrt parameters + + Parameters + ---------- + samples : Tensor + number of samples + pi_local : Tensor + weight of local variational part + pi_global : Tensor + weight of global variational part + + Returns + ------- + ELBO samples + """ + samples = tt.as_tensor(samples) + pi_local = tt.as_tensor(pi_local) + pi_global = tt.as_tensor(pi_global) + posterior = self.posterior(samples) + elbo = (self.log_p_D(posterior) + - pi_local * self.log_q_W_local(posterior) + - pi_global * self.log_q_W_global(posterior) + + pi_local * self.log_p_W_local(posterior) + + pi_global * self.log_p_W_global(posterior)) + return elbo + + @property + @memoize + @change_flags(compute_test_value='off') + def posterior_fn(self): + In = theano.In + samples = tt.iscalar('n_samples') + zeros = tt.bscalar('zeros') + posterior = self.posterior(samples, zeros) + fn = theano.function([In(samples, 'samples', 1, allow_downcast=True), + In(zeros, 'zeros', 0, allow_downcast=True)], + posterior) + + 
def inner(samples=1, zeros=False):
+            return fn(samples, int(zeros))
+        return inner
+
+    def sample_vp(self, draws=1, zeros=False, hide_transformed=False):
+        if hide_transformed:
+            vars_sampled = [v_ for v_ in self.model.unobserved_RVs
+                            if not str(v_).endswith('_')]
+        else:
+            vars_sampled = [v_ for v_ in self.model.unobserved_RVs]
+        posterior = self.posterior_fn(draws, zeros)
+        names = [var.name for var in self.local_vars + self.global_vars]
+        samples = {name: self.view_from(posterior, name)
+                   for name in names}
+
+        def points():
+            for i in range(draws):
+                yield {name: samples[name][i]
+                       for name in names}
+
+        trace = pm.sampling.NDArray(model=self.model, vars=vars_sampled)
+        trace.setup(draws=draws, chain=0)
+        for point in points():
+            trace.record(point)
+        return pm.sampling.MultiTrace([trace])
+
+    @property
+    @memoize
+    def posterior_to_point_fn(self):
+        names_vars = [var.name for var in self.local_vars + self.global_vars]
+        names_point = names_vars + [var.name for var in self.model.deterministics]
+        point_fn = self.model.fastfn(self.local_vars + self.global_vars + self.model.deterministics)
+
+        def inner(posterior):
+            if posterior.shape[0] > 1:
+                raise ValueError('Posterior should have one sample')
+            point = {name: self.view_from(posterior, name)[0]
+                     for name in names_vars}
+            return dict(zip(names_point, point_fn(point)))
+        return inner
+
+    @property
+    def median(self):
+        posterior = self.posterior_fn(samples=1, zeros=True)
+        return self.posterior_to_point_fn(posterior)
+
+    def random(self):
+        posterior = self.posterior_fn(samples=1, zeros=False)
+        return self.posterior_to_point_fn(posterior)
+
+    @property
+    def local_vmap(self):
+        return self.order.vmap[:len(self.local_vars)]
+
+    @property
+    def global_vmap(self):
+        return self.order.vmap[len(self.local_vars):]
+
+    @property
+    def local_size(self):
+        size = sum([0] + [v.dsize for v in self.local_vars])
+        return size
+
+    @property
+    def global_size(self):
+        return self.total_size - self.local_size
+
+    @property
+    def total_size(self):
+        return self.order.dimensions
+
+    @property
+    def local_slc(self):
+        return slice(0, self.local_size)
+
+    @property
+    def global_slc(self):
+        return slice(self.local_size, self.total_size)
+
+
+class ADVI(BaseApproximation):
+    """ADVI algorithm as described in Kucukelbir et al. [2]_ with
+    the variance reduction of [3]_. ADVI also supports Auto-Encoding
+    Variational Bayes as described in [1]_ and minibatch training.
+
+    The main goal of ADVI is a fast variational inference algorithm
+    that scales to large problems. The limitation of this
+    approximation is that latent variables are assumed to be independent
+    and are each approximated with a normal distribution. These assumptions
+    rarely hold exactly in practice.
+
+    Parameters
+    ----------
+    population : dict[Variable->int]
+        maps an observed_RV to its population size;
+        if not provided, defaults to the full population
+        Note: the population size is `shape[0]` of the whole data
+    known : dict[Variable->(mu, rho)]
+        maps a local random variable to mu and rho for its posterior parametrization;
+        it is used for Auto-Encoding Variational Bayes
+        (AEVB; Kingma and Welling, 2014)[1]_
+    model : Model
+        PyMC3 model whose posterior is going to be approximated
+
+    References
+    ----------
+    .. [1] Kingma, D. P., & Welling, M. (2014).
+        Auto-Encoding Variational Bayes. stat, 1050, 1.
+
+    .. [2] Kucukelbir, A., Ranganath, R., Gelman, A., & Blei, D. (2015).
+        Automatic variational inference in Stan. In Advances in neural
+        information processing systems (pp. 568-576).
+
+    ..
[3] Geoffrey, R., Yuhuai, W., David, D. (2016) + Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI. + NIPS Workshop + """ + def create_shared_params(self): + return {'mu': theano.shared( + self.input.tag.test_value[self.global_slc]), + 'rho': theano.shared( + np.zeros((self.global_size,), dtype=theano.config.floatX))} + + def posterior_global(self, initial): + sd = rho2sd(self.shared_params['rho']) + mu = self.shared_params['mu'] + return sd * initial + mu + + def log_q_W_global(self, posterior): + """ + log_q_W samples over q for global vars + + Gradient wrt mu, rho in density parametrization + is set to zero to lower variance of ELBO + """ + mu = self.shared_params['mu'] + rho = self.shared_params['rho'] + samples = tt.sum(log_normal(posterior[:, self.global_slc], Z(mu), rho=Z(rho)), axis=1) + return samples diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py new file mode 100644 index 0000000000..48bbb8b41c --- /dev/null +++ b/pymc3/variational/inference.py @@ -0,0 +1,147 @@ +import numpy as np +import theano +import pymc3 as pm +from .approximations import ADVI, BaseApproximation +from .advi import adagrad_optimizer +from tqdm import trange + + +APPROXIMATIONS = { + 'advi': ADVI, +} + + +def approximate(n=10000, population=None, local_vars=None, + optimizer=None, method='advi', + samples=1, pi_local=1, pi_global=1, + callbacks=None, learning_rate=.001, epsilon=.1, + model=None, more_params=None, more_updates=None, + *args, **kwargs): + """ + Interface for efficient variational inference with gradient + variance reduction as described in [4]_ + + + PyMC3 supports the following variational inference methods: + 1) Automatic differentiation variational inference (ADVI) [2]_ + + Parameters + ---------- + n : int + number of training iterations + population : dict[Variable->int] + maps observed_RV to its population size + if not provided defaults to full population + Note: population size is `shape[0]` of the whole data + local_vars : dict[Variable->(mu, rho)] + maps local random variable to mu and rho for posterior + parametrization, it is used for Autoencoding Variational Bayes + (AEVB; Kingma and Welling, 2014)[1]_ + optimizer : callable + optional custom optimizer to be called in the following way: + :code:`updates = optimizer(-elbo, list_of_params)` + method : str|Approximation + string description of approximation to be used or + Approximation instance used to calculate elbo and provide + for shared params + samples : int|Tensor + number of Monte Carlo samples used for approximation, + defaults to 1 + pi_local : float|Tensor + pi_local in [0;1] reweighting constant for local KL divergence + this trick was described in [3]_ for fine-tuning minibatch ADVI + pi_global : float|Tensor + pi_global in [0;1] reweighting constant for global KL divergence + this trick was described in [3]_ for fine-tuning minibatch ADVI + callbacks : list[callable] + callables that will be called in the following way + :code:`callback(Approximation, elbo_history, i)` + learning_rate : float + learning rate for adagrad optimizer, + it will be ignored if optimizer is passed + epsilon : float - epsilon for adagrad optimizer, + it will be ignored if optimizer is passed + model : Model + Probabilistic model if function is called out of context + more_params : list + optional more parameters for computing gradients + more_updates : dict + more updates are included in step function + args : additional args that will be passed to Approximation + kwargs : additional kwargs that 
will be passed to Approximation + + Returns + ------- + (Approximation, elbo_history) + + Notes + ----- + Remember that you can manipulate pi and number of samples with callbacks + + References + ---------- + .. [1] Kingma, D. P., & Welling, M. (2014). + Auto-Encoding Variational Bayes. stat, 1050, 1. + + .. [2] Kucukelbir, A., Ranganath, R., Gelman, A., & Blei, D. (2015). + Automatic variational inference in Stan. In Advances in neural + information processing systems (pp. 568-576). + + .. [3] Blundell, C., Cornebise, J., Kavukcuoglu, K., & Wierstra, D. (2015). + Weight Uncertainty in Neural Network. In Proceedings of the 32nd + International Conference on Machine Learning (ICML-15) (pp. 1613-1622). + + .. [4] Geoffrey, R., Yuhuai, W., David, D. (2016) + Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI. + NIPS Workshop + """ + if isinstance(method, BaseApproximation): + approx = method + else: + approx = APPROXIMATIONS[method](known=local_vars, population=population, + model=model, *args, **kwargs) + if callbacks is None: + callbacks = [] + if optimizer is None: + optimizer = adagrad_optimizer(learning_rate, epsilon) + elbo = approx.elbo(samples=samples, + pi_local=pi_local, + pi_global=pi_global).mean() + params = approx.params + if more_params is not None: + params += more_params + updates = optimizer(-elbo, params) + if more_updates is not None: + more_updates.update(updates) + updates = more_updates + step = theano.function([], elbo, updates=updates) + i = 0 + elbos = np.empty(n) + try: + progress = trange(n) + for i in progress: + e = step() + elbos[i] = e + for callback in callbacks: + callback(approx, elbos[:i+1], i) + if n < 10: + progress.set_description('ELBO = {:,.5g}'.format(elbos[i])) + elif i % (n // 10) == 0 and i > 0: + avg_elbo = elbos[i - n // 10:i].mean() + progress.set_description('Average ELBO = {:,.5g}'.format(avg_elbo)) + except KeyboardInterrupt: + elbos = elbos[:i] + if n < 10: + pm._log.info('Interrupted at {:,d} [{:.0f}%]: ELBO = {:,.5g}'.format( + i, 100 * i // n, elbos[i])) + else: + avg_elbo = elbos[i - n // 10:].mean() + pm._log.info('Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format( + i, 100 * i // n, avg_elbo)) + else: + if n < 10: + pm._log.info('Finished [100%]: ELBO = {:,.5g}'.format(elbos[-1])) + else: + avg_elbo = elbos[-n // 10:].mean() + pm._log.info('Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo)) + return approx, elbos
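
For reference (not part of the patch): a rough, untested sketch of how the generator support added to `pandas_to_array` and the new `total_size` argument could be combined for minibatch training. The dataset, batch size, helper names (`minibatches`, `total`) and the model are purely illustrative assumptions; dtype handling (casting batches to `theano.config.floatX`) may need extra care.

    import numpy as np
    import pymc3 as pm
    from pymc3.theanof import generator  # GeneratorOp shortcut added in this patch

    data = np.random.randn(1000)  # illustrative full dataset
    total = data.shape[0]

    def minibatches(batch_size=100):
        # GeneratorOp expects an endless generator yielding same-dtype arrays
        while True:
            idx = np.random.randint(0, total, batch_size)
            yield data[idx]

    # a tensor that yields a new batch every time it is evaluated
    minibatch_x = generator(minibatches())

    with pm.Model():
        mu = pm.Normal('mu', mu=0, sd=10)
        # the minibatch logp is rescaled by total_size / batch_size,
        # as implemented for FreeRV/ObservedRV above
        pm.Normal('x', mu=mu, sd=1, observed=minibatch_x, total_size=total)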
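Likewise, a minimal sketch of fitting the new ADVI approximation with `approximate` and drawing posterior samples with `sample_vp`, mirroring `test_advi_optimizer` and `test_sample_vp` above; the toy data, model and iteration count are arbitrary:

    import numpy as np
    import pymc3 as pm
    from pymc3.variational.approximations import ADVI
    from pymc3.variational.inference import approximate

    observed = np.random.randn(100) + 5.0  # toy data

    with pm.Model():
        mu = pm.Normal('mu', mu=0, sd=10)
        pm.Normal('x', mu=mu, sd=1, observed=observed)

        mf = ADVI()                                    # mean-field approximation
        approx, elbos = approximate(10000, method=mf)  # returns (Approximation, elbo history)
        trace = mf.sample_vp(draws=1000, hide_transformed=True)

    print(np.mean(trace['mu']), mf.median['mu'])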
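Finally, a small numerical check (illustrative only) of the `sd2rho`/`rho2sd` converters and the four equivalent `log_normal` parametrizations documented in `dist_math`:

    import numpy as np
    from pymc3.distributions.dist_math import log_normal, rho2sd, sd2rho

    sd = 2.0
    # rho = log(exp(sd) - 1) and sd = log1p(exp(rho)) are inverses of each other
    assert np.isclose(rho2sd(sd2rho(sd)).eval(), sd)

    x, mean = 1.0, 0.0
    # sd, w = log(sd), rho = log(exp(sd) - 1) and tau = sd**-1 (as implemented here)
    # all parametrize the same density, so the values should agree
    vals = [log_normal(x, mean, sd=sd).eval(),
            log_normal(x, mean, w=np.log(sd)).eval(),
            log_normal(x, mean, rho=sd2rho(sd).eval()).eval(),
            log_normal(x, mean, tau=1.0 / sd).eval()]
    assert np.allclose(vals, vals[0])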