diff --git a/pymc3/data.py b/pymc3/data.py
index 944ce55458..fcac737719 100644
--- a/pymc3/data.py
+++ b/pymc3/data.py
@@ -1,7 +1,10 @@
+import itertools
 import pkgutil
 import io
 
-__all__ = ['get_data_file']
+import theano.tensor as tt
+
+__all__ = ['get_data_file', 'DataGenerator']
 
 
 def get_data_file(pkg, path):
@@ -19,3 +22,28 @@ def get_data_file(pkg, path):
     """
 
     return io.BytesIO(pkgutil.get_data(pkg, path))
+
+
+class DataGenerator(object):
+    """
+    Helper class that infers the data type of a generator by looking
+    at its first item, while preserving the order of the resulting generator
+    """
+    def __init__(self, generator):
+        self.test_value = next(generator)
+        self.gen = itertools.chain([self.test_value], generator)
+        self.tensortype = tt.TensorType(
+            self.test_value.dtype,
+            ((False, ) * self.test_value.ndim))
+
+    def __next__(self):
+        return next(self.gen)
+
+    def __iter__(self):
+        return self
+
+    def __eq__(self, other):
+        return id(self) == id(other)
+
+    def __hash__(self):
+        return hash(id(self))
diff --git a/pymc3/distributions/dist_math.py b/pymc3/distributions/dist_math.py
index 90ae896529..0c524fda4c 100644
--- a/pymc3/distributions/dist_math.py
+++ b/pymc3/distributions/dist_math.py
@@ -10,6 +10,9 @@
 from .special import gammaln, multigammaln
 
 
+c = - 0.5 * np.log(2 * np.pi)
+
+
 def bound(logp, *conditions, **kwargs):
     """
     Bounds a log probability density with several conditions.
@@ -95,3 +98,64 @@ def i1(x):
         x**9 / 1474560 + x**11 / 176947200 + x**13 / 29727129600,
         np.e**x / (2 * np.pi * x)**0.5 * (1 - 3 / (8 * x) + 15 / (128 * x**2) + 315 / (3072 * x**3) +
                                           14175 / (98304 * x**4)))
+
+
+def sd2rho(sd):
+    """
+    `sd -> rho` theano converter
+
+    :math:`mu + sd*e = mu + log(1+exp(rho))*e`"""
+    return tt.log(tt.exp(sd) - 1)
+
+
+def rho2sd(rho):
+    """
+    `rho -> sd` theano converter
+
+    :math:`mu + sd*e = mu + log(1+exp(rho))*e`"""
+    return tt.log1p(tt.exp(rho))
+
+
+def log_normal(x, mean, **kwargs):
+    """
+    Calculate the logarithm of the normal density at point `x`
+    for a given `mean` and standard deviation
+
+    Parameters
+    ----------
+    x : Tensor
+        point of evaluation
+    mean : Tensor
+        mean of normal distribution
+    kwargs : exactly one of the parameters `{sd, tau, w, rho}`
+
+    Notes
+    -----
+    There are four variants for density parametrization.
+ They are: + 1) standard deviation - `std` + 2) `w`, logarithm of `std` :math:`w = log(std)` + 3) `rho` that follows this equation :math:`rho = log(exp(std) - 1)` + 4) `tau` that follows this equation :math:`tau = std^{-1}` + ---- + """ + sd = kwargs.get('sd') + w = kwargs.get('w') + rho = kwargs.get('rho') + tau = kwargs.get('tau') + eps = kwargs.get('eps', 0.0) + check = sum(map(lambda a: a is not None, [sd, w, rho, tau])) + if check > 1: + raise ValueError('more than one required kwarg is passed') + if check == 0: + raise ValueError('none of required kwarg is passed') + if sd is not None: + std = sd + elif w is not None: + std = tt.exp(w) + elif rho is not None: + std = rho2sd(rho) + else: + std = tau**(-1) + std += eps + return c - tt.log(tt.abs_(std)) - (x - mean) ** 2 / (2 * std ** 2) diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index 2ad2243f74..2a38b91aea 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -8,7 +8,7 @@ from .dist_math import bound -__all__ = ['DensityDist', 'Distribution', 'Continuous', 'Bound', +__all__ = ['DensityDist', 'Distribution', 'Continuous', 'Bound', 'Discrete', 'NoDistribution', 'TensorType', 'draw_values'] @@ -30,8 +30,9 @@ def __new__(cls, name, *args, **kwargs): if isinstance(name, string_types): data = kwargs.pop('observed', None) + total_size = kwargs.pop('total_size', None) dist = cls.dist(*args, **kwargs) - return model.Var(name, dist, data) + return model.Var(name, dist, data, total_size) else: raise TypeError("Name needs to be a string but got: %s" % name) @@ -410,7 +411,7 @@ def __init__(self, distribution, lower, upper, transform='infer', *args, **kwarg self.transform = transforms.upperbound(upper) if default >= upper: self.testval = upper - 1 - + if issubclass(distribution, Discrete): self.transform = None diff --git a/pymc3/math.py b/pymc3/math.py index 97c6d92b50..adb34bef7a 100644 --- a/pymc3/math.py +++ b/pymc3/math.py @@ -22,3 +22,7 @@ def invlogit(x, eps=sys.float_info.epsilon): def logit(p): return tt.log(p / (1 - p)) + + +def flatten_list(tensors): + return tt.concatenate([var.ravel() for var in tensors]) diff --git a/pymc3/model.py b/pymc3/model.py index 6017484d06..8c480c87e0 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -8,7 +8,7 @@ import pymc3 as pm from .memoize import memoize -from .theanof import gradient, hessian, inputvars +from .theanof import gradient, hessian, inputvars, generator from .vartypes import typefilter, discrete_types, continuous_types from .blocking import DictToArrayBijection, ArrayOrdering @@ -458,7 +458,7 @@ def cont_vars(self): """All the continuous variables in the model""" return list(typefilter(self.vars, continuous_types)) - def Var(self, name, dist, data=None): + def Var(self, name, dist, data=None, total_size=None): """Create and add (un)observed random variable to the model with an appropriate prior distribution. @@ -469,6 +469,8 @@ def Var(self, name, dist, data=None): data : array_like (optional) If data is provided, the variable is observed. If None, the variable is unobserved. 
+ total_size : scalar + upscales logp of variable with :math:`coef = total_size/var.shape[0]` Returns ------- @@ -477,11 +479,13 @@ def Var(self, name, dist, data=None): name = self.name_for(name) if data is None: if getattr(dist, "transform", None) is None: - var = FreeRV(name=name, distribution=dist, model=self) + var = FreeRV(name=name, distribution=dist, model=self, + total_size=total_size) self.free_RVs.append(var) else: var = TransformedRV(name=name, distribution=dist, model=self, - transform=dist.transform) + transform=dist.transform, + total_size=total_size) pm._log.debug('Applied {transform}-transform to {name}' ' and added transformed {orig_name} to model.'.format( transform=dist.transform.name, @@ -491,7 +495,7 @@ def Var(self, name, dist, data=None): return var elif isinstance(data, dict): var = MultiObservedRV(name=name, data=data, distribution=dist, - model=self) + model=self, total_size=total_size) self.observed_RVs.append(var) if var.missing_values: self.free_RVs += var.missing_values @@ -500,7 +504,8 @@ def Var(self, name, dist, data=None): self.named_vars[v.name] = v else: var = ObservedRV(name=name, data=data, - distribution=dist, model=self) + distribution=dist, model=self, + total_size=total_size) self.observed_RVs.append(var) if var.missing_values: self.free_RVs.append(var.missing_values) @@ -717,7 +722,7 @@ class FreeRV(Factor, TensorVariable): """Unobserved random variable that a model is specified in terms of.""" def __init__(self, type=None, owner=None, index=None, name=None, - distribution=None, model=None): + distribution=None, model=None, total_size=None): """ Parameters ---------- @@ -725,7 +730,10 @@ def __init__(self, type=None, owner=None, index=None, name=None, owner : theano owner (optional) name : str distribution : Distribution - model : Model""" + model : Model + total_size : scalar Tensor (optional) + needed for upscaling logp + """ if type is None: type = distribution.type super(FreeRV, self).__init__(type, owner, index, name) @@ -736,7 +744,14 @@ def __init__(self, type=None, owner=None, index=None, name=None, self.distribution = distribution self.tag.test_value = np.ones( distribution.shape, distribution.dtype) * distribution.default() - self.logp_elemwiset = distribution.logp(self) + logp_elemwiset = distribution.logp(self) + if total_size is None: + coef = tt.as_tensor(1) + else: + assert logp_elemwiset.ndim >= 1, ('Variable with scaled density ' + 'needs to be at least 1 dimensional') + coef = tt.as_tensor(total_size) / logp_elemwiset.shape[0] + self.logp_elemwiset = logp_elemwiset * coef self.model = model incorporate_methods(source=distribution, destination=self, @@ -759,6 +774,8 @@ def pandas_to_array(data): return data elif isinstance(data, theano.gof.graph.Variable): return data + elif hasattr(data, '__next__'): + return generator(data) else: return np.asarray(data) @@ -792,7 +809,7 @@ class ObservedRV(Factor, TensorVariable): """ def __init__(self, type=None, owner=None, index=None, name=None, data=None, - distribution=None, model=None): + distribution=None, model=None, total_size=None): """ Parameters ---------- @@ -801,6 +818,8 @@ def __init__(self, type=None, owner=None, index=None, name=None, data=None, name : str distribution : Distribution model : Model + total_size : scalar Tensor (optional) + needed for upscaling logp """ from .distributions import TensorType if type is None: @@ -814,7 +833,14 @@ def __init__(self, type=None, owner=None, index=None, name=None, data=None, data = as_tensor(data, name, model, distribution) 
self.missing_values = data.missing_values - self.logp_elemwiset = distribution.logp(data) + logp_elemwiset = distribution.logp(data) + if total_size is None: + coef = tt.as_tensor(1) + else: + assert logp_elemwiset.ndim >= 1, ('Variable with scaled density ' + 'needs to be at least 1 dimensional') + coef = tt.as_tensor(total_size) / logp_elemwiset.shape[0] + self.logp_elemwiset = logp_elemwiset * coef self.model = model self.distribution = distribution @@ -835,7 +861,7 @@ class MultiObservedRV(Factor): Potentially partially observed. """ - def __init__(self, name, data, distribution, model): + def __init__(self, name, data, distribution, model, total_size=None): """ Parameters ---------- @@ -844,6 +870,8 @@ def __init__(self, name, data, distribution, model): name : str distribution : Distribution model : Model + total_size : scalar Tensor (optional) + needed for upscaling logp """ self.name = name self.data = {name: as_tensor(data, name, model, distribution) @@ -851,7 +879,14 @@ def __init__(self, name, data, distribution, model): self.missing_values = [datum.missing_values for datum in self.data.values() if datum.missing_values is not None] - self.logp_elemwiset = distribution.logp(**self.data) + logp_elemwiset = distribution.logp(**self.data) + if total_size is None: + coef = tt.as_tensor(1) + else: + assert logp_elemwiset.ndim >= 1, ('Variable with scaled density ' + 'needs to be at least 1 dimensional') + coef = tt.as_tensor(total_size) / logp_elemwiset.shape[0] + self.logp_elemwiset = logp_elemwiset * coef self.model = model self.distribution = distribution @@ -896,17 +931,20 @@ def Potential(name, var, model=None): class TransformedRV(TensorVariable): def __init__(self, type=None, owner=None, index=None, name=None, - distribution=None, model=None, transform=None): + distribution=None, model=None, transform=None, + total_size=None): """ Parameters ---------- type : theano type (optional) owner : theano owner (optional) - name : str distribution : Distribution - model : Model""" + model : Model + total_size : scalar Tensor (optional) + needed for upscaling logp + """ if type is None: type = distribution.type super(TransformedRV, self).__init__(type, owner, index, name) @@ -916,7 +954,7 @@ def __init__(self, type=None, owner=None, index=None, name=None, transformed_name = "{}_{}_".format(name, transform.name) self.transformed = model.Var( - transformed_name, transform.apply(distribution)) + transformed_name, transform.apply(distribution), total_size=total_size) normalRV = transform.backward(self.transformed) diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py index 4eb96e6789..0b9696a4ae 100644 --- a/pymc3/tests/test_model.py +++ b/pymc3/tests/test_model.py @@ -1,5 +1,5 @@ import unittest -import theano.tensor as tt +from theano import theano, tensor as tt import pymc3 as pm from pymc3.distributions import HalfCauchy, Normal from pymc3 import Potential, Deterministic @@ -118,3 +118,12 @@ def test_model_root(self): self.assertTrue(model is model.root) with pm.Model() as sub: self.assertTrue(model is sub.root) + + def test_density_scaling(self): + with pm.Model() as model1: + Normal('n', observed=[[1]], total_size=1) + p1 = theano.function([], model1.logpt) + with pm.Model() as model2: + Normal('n', observed=[[1]], total_size=2) + p2 = theano.function([], model2.logpt) + self.assertEqual(p1() * 2, p2()) diff --git a/pymc3/tests/test_theanof.py b/pymc3/tests/test_theanof.py new file mode 100644 index 0000000000..ebc7308781 --- /dev/null +++ b/pymc3/tests/test_theanof.py @@ 
-0,0 +1,50 @@ +import itertools +import unittest +import numpy as np +import theano +from ..theanof import DataGenerator, GeneratorOp, generator + + +def integers(): + i = 0 + while True: + yield np.float32(i) + i += 1 + + +def integers_ndim(ndim): + i = 0 + while True: + yield np.ones((2,) * ndim) * i + i += 1 + + +class TestGenerator(unittest.TestCase): + def test_basic(self): + generator = DataGenerator(integers()) + gop = GeneratorOp(generator)() + self.assertEqual(gop.tag.test_value, np.float32(0)) + f = theano.function([], gop) + self.assertEqual(f(), np.float32(0)) + self.assertEqual(f(), np.float32(1)) + for i in range(2, 100): + f() + self.assertEqual(f(), np.float32(100)) + + def test_ndim(self): + for ndim in range(10): + res = list(itertools.islice(integers_ndim(ndim), 0, 2)) + generator = DataGenerator(integers_ndim(ndim)) + gop = GeneratorOp(generator)() + f = theano.function([], gop) + self.assertEqual(ndim, res[0].ndim) + np.testing.assert_equal(f(), res[0]) + np.testing.assert_equal(f(), res[1]) + + def test_cloning_available(self): + gop = generator(integers()) + res = gop ** 2 + shared = theano.shared(np.float32(10)) + res1 = theano.clone(res, {gop: shared}) + f = theano.function([], res1) + self.assertEqual(f(), np.float32(100)) diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py new file mode 100644 index 0000000000..0b75a69b6a --- /dev/null +++ b/pymc3/tests/test_variational_inference.py @@ -0,0 +1,112 @@ +import unittest +import numpy as np +import theano +import theano.tensor as tt +from theano.sandbox.rng_mrg import MRG_RandomStreams +import pymc3 as pm +from pymc3 import Model, Normal +from pymc3.variational.approximations import ADVI +from pymc3.variational.inference import approximate +from ..theanof import set_tt_rng +from . 
import models +set_tt_rng(MRG_RandomStreams(42)) + + +class TestApproximates: + class Base(unittest.TestCase): + approx = None + NITER = 30000 + + def test_elbo(self): + if self.approx is not ADVI: + raise unittest.SkipTest + mu0 = 1.5 + sigma = 1.0 + y_obs = np.array([1.6, 1.4]) + + post_mu = np.array([1.88]) + post_sd = np.array([1]) + # Create a model for test + with Model() as model: + mu = Normal('mu', mu=mu0, sd=sigma) + Normal('y', mu=mu, sd=1, observed=y_obs) + + # Create variational gradient tensor + mean_field = self.approx(model=model) + elbo = mean_field.elbo(samples=10000) + + mean_field.shared_params['mu'].set_value(post_mu) + mean_field.shared_params['rho'].set_value(np.log(np.exp(post_sd) - 1)) + + f = theano.function([], elbo.mean()) + elbo_mc = f() + + # Exact value + elbo_true = (-0.5 * ( + 3 + 3 * post_mu**2 - 2 * (y_obs[0] + y_obs[1] + mu0) * post_mu + + y_obs[0]**2 + y_obs[1]**2 + mu0**2 + 3 * np.log(2 * np.pi)) + + 0.5 * (np.log(2 * np.pi) + 1)) + np.testing.assert_allclose(elbo_mc, elbo_true, rtol=0, atol=1e-1) + + def test_vary_samples(self): + _, model, _ = models.simple_model() + i = tt.iscalar('i') + i.tag.test_value = 1 + with model: + mf = self.approx() + elbo = mf.elbo(i) + elbos = theano.function([i], elbo) + self.assertEqual(elbos(1).shape[0], 1) + self.assertEqual(elbos(10).shape[0], 10) + + def test_vars_view(self): + _, model, _ = models.multidimensional_model() + with model: + mf = self.approx() + posterior = mf.posterior(10) + x_sampled = mf.view_from(posterior, 'x').eval() + self.assertEqual(x_sampled.shape, (10,) + model['x'].dshape) + + def test_sample_vp(self): + n_samples = 100 + xs = np.random.binomial(n=1, p=0.2, size=n_samples) + with pm.Model(): + p = pm.Beta('p', alpha=1, beta=1) + pm.Binomial('xs', n=1, p=p, observed=xs) + mf = self.approx() + trace = mf.sample_vp(draws=1, hide_transformed=True) + self.assertListEqual(trace.varnames, ['p']) + trace = mf.sample_vp(draws=1, hide_transformed=False) + self.assertListEqual(sorted(trace.varnames), ['p', 'p_logodds_']) + + def test_advi_optimizer(self): + n = 1000 + sd0 = 2. + mu0 = 4. + sd = 3. + mu = -5. + + data = sd * np.random.randn(n) + mu + + d = n / sd ** 2 + 1 / sd0 ** 2 + mu_post = (n * np.mean(data) / sd ** 2 + mu0 / sd0 ** 2) / d + + with Model(): + mu_ = Normal('mu', mu=mu0, sd=sd0, testval=0) + Normal('x', mu=mu_, sd=sd, observed=data) + pm.Deterministic('mu_sq', mu_**2) + mf = self.approx() + approximate(self.NITER, method=mf) + trace = mf.sample_vp(10000) + + np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) + np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. 
/ d), rtol=0.4)
+        np.testing.assert_allclose(mf.median['mu'], mu_post, rtol=0.1)
+        np.testing.assert_allclose(mf.median['mu_sq'], mu_post**2, rtol=0.1)
+
+
+class TestMeanField(TestApproximates.Base):
+    approx = ADVI
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/pymc3/theanof.py b/pymc3/theanof.py
index 56bad3fb0e..ae8648d926 100644
--- a/pymc3/theanof.py
+++ b/pymc3/theanof.py
@@ -1,15 +1,47 @@
 import numpy as np
+import contextlib
 from .vartypes import typefilter, continuous_types
 from theano import theano, scalar, tensor as tt
 from theano.gof.graph import inputs
+from theano.configparser import change_flags
+from theano.tensor import TensorVariable
+from theano.gof import Op
+from theano.configparser import change_flags
 from .memoize import memoize
 from .blocking import ArrayOrdering
-
-__all__ = ['gradient', 'hessian', 'hessian_diag', 'inputvars',
-           'cont_inputs', 'floatX', 'jacobian',
-           'CallableTensor', 'join_nonshared_inputs',
-           'make_shared_replacements']
+from theano.sandbox.rng_mrg import MRG_RandomStreams
+from .data import DataGenerator
+
+
+__all__ = ['gradient',
+           'hessian',
+           'hessian_diag',
+           'inputvars',
+           'cont_inputs',
+           'jacobian',
+           'CallableTensor',
+           'join_nonshared_inputs',
+           'make_shared_replacements',
+           'tt_rng',
+           'set_tt_rng',
+           'no_test_val',
+           'change_flags',
+           'generator']
+
+
+@contextlib.contextmanager
+def no_test_val():
+    """Context manager that temporarily disables theano test values.
+    For decorating functions, use `theano.configparser.change_flags` instead.
+
+    Usage
+    -----
+    >>> with no_test_val():
+    ...
+    """
+    theano.config.compute_test_value = 'off'
+    yield
+    theano.config.compute_test_value = 'raise'
 
 
 def inputvars(a):
@@ -242,3 +274,86 @@ def __call__(self, input):
 
 scalar_identity = IdentityOp(scalar.upgrade_to_float, name='scalar_identity')
 identity = tt.Elemwise(scalar_identity, name='identity')
+
+
+@change_flags(compute_test_value='off')
+def launch_rng(rng):
+    """Helper function for safely initializing an rng.
+    If the rng is not launched this way, there can be problems with test_value
+
+    Parameters
+    ----------
+    rng : `theano.sandbox.rng_mrg.MRG_RandomStreams` instance
+    """
+    state = rng.rstate
+    rng.inc_rstate()
+    rng.set_rstate(state)
+
+_tt_rng = MRG_RandomStreams()
+launch_rng(_tt_rng)
+
+
+def tt_rng():
+    """Get the package-level random number generator.
+
+    Returns
+    -------
+    `theano.sandbox.rng_mrg.MRG_RandomStreams` instance
+        instance passed to the most recent call of `set_tt_rng`
+    """
+    return _tt_rng
+
+
+def set_tt_rng(new_rng):
+    """Set the package-level random number generator.
+
+    Parameters
+    ----------
+    new_rng : `theano.sandbox.rng_mrg.MRG_RandomStreams` instance
+        The random number generator to use.
+    """
+    global _tt_rng
+    _tt_rng = new_rng
+    launch_rng(_tt_rng)
+
+
+class GeneratorOp(Op):
+    """
+    Generator Op is designed for storing python generators
+    inside a theano graph. The main limitation is the generator itself.
+
+    There are some important cases when a generator becomes exhausted:
+        - a finite (non-endless) generator is passed
+        - an exception is raised while `generator.__next__` is performed.
+          Note: this is dangerous with plain python generators, but fine with
+          custom class-based generators that keep explicit state
+    In addition, the data type must be the same on each iteration.
+    """
+    __props__ = ('generator',)
+
+    def __init__(self, gen):
+        if not isinstance(gen, DataGenerator):
+            gen = DataGenerator(gen)
+        super(GeneratorOp, self).__init__()
+        self.generator = gen
+        self.itypes = []
+        self.otypes = [self.generator.tensortype]
+
+    def perform(self, node, inputs, output_storage, params=None):
+        try:
+            output_storage[0][0] = self.generator.__next__()
+        except StopIteration:
+            output_storage[0][0] = np.nan
+
+    def do_constant_folding(self, node):
+        return False
+
+    @change_flags(compute_test_value='off')
+    def __call__(self, *args, **kwargs):
+        rval = super(GeneratorOp, self).__call__(*args, **kwargs)
+        rval.tag.test_value = self.generator.test_value
+        return rval
+
+
+def generator(gen):
+    """shortcut for `GeneratorOp`"""
+    return GeneratorOp(gen)()
diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py
new file mode 100644
index 0000000000..42caf4bd72
--- /dev/null
+++ b/pymc3/variational/approximations.py
@@ -0,0 +1,597 @@
+import collections
+import theano.tensor as tt
+from theano.ifelse import ifelse
+import theano
+import pymc3 as pm
+from ..theanof import tt_rng, memoize, change_flags
+from ..blocking import ArrayOrdering
+from ..distributions.dist_math import rho2sd, log_normal
+from ..math import flatten_list
+from ..model import modelcontext
+import numpy as np
+
+# helper class
+FlatView = collections.namedtuple('FlatView', 'input, replacements')
+# shortcut for zero grad
+Z = theano.gradient.zero_grad
+
+
+class BaseApproximation(object):
+    """
+    Base class for variational methods
+
+    It uses the idea that we can map an initial standard normal
+    distribution to the approximate posterior distribution.
+
+    Three methods are required for an implementation:
+    1) creating shared parameters `create_shared_params()`
+    2) mapping the initial distribution to the posterior `posterior{_{global|local}}(initial)`
+    3) calculating log_q_W for the approximation `log_q_W{_{global|local}}(posterior)`
+
+    The ELBO can then be calculated for the given approximation as
+    `log_p_D(posterior) - KL_q_p_W(posterior)`
+
+    Parameters
+    ----------
+    population : dict[Variable->int]
+        maps an observed_RV to its population size;
+        if not provided, defaults to the full population
+        Note: the population size is `shape[0]` of the whole data
+    known : dict[Variable->(mu, rho)]
+        maps a local random variable to mu and rho for its posterior parametrization;
+        it is used for Auto-Encoding Variational Bayes
+        (AEVB; Kingma and Welling, 2014)[1]_
+    model : Model
+        PyMC3 model whose posterior is going to be approximated
+
+    References
+    ----------
+    .. [1] Kingma, D. P., & Welling, M. (2014).
+        Auto-Encoding Variational Bayes. stat, 1050, 1.
+ """ + def __init__(self, population=None, known=None, model=None): + model = modelcontext(model) + self.check_model(model) + if known is None: + known = dict() + if population is None: + population = dict() + self.population = population + self.model = model + + def get_transformed(v): + if hasattr(v, 'transformed'): + return v.transformed + return v + known = {get_transformed(k): v for k, v in known.items()} + self.known = known + self.local_vars = [v for v in model.free_RVs if v in known] + self.global_vars = [v for v in model.free_RVs if v not in known] + self.order = ArrayOrdering(self.local_vars + self.global_vars) + inputvar = tt.vector('flat_view', dtype=theano.config.floatX) + inputvar.tag.test_value = flatten_list(self.local_vars + self.global_vars).tag.test_value + replacements = {self.model.named_vars[name]: inputvar[slc].reshape(shape).astype(dtype) + for name, slc, shape, dtype in self.order.vmap} + self.flat_view = FlatView(inputvar, replacements) + self.view = {vm.var: vm for vm in self.order.vmap} + self.shared_params = self.create_shared_params() + + def set_params(self, params): + self.shared_params.update(params) + + @property + def params(self): + return list(self.shared_params.values()) + + @staticmethod + def check_model(model): + """ + Checks that model is valid for variational inference + """ + vars_ = [var for var in model.vars if not isinstance(var, pm.model.ObservedRV)] + if any([var.dtype in pm.discrete_types for var in vars_]): + raise ValueError('Model should not include discrete RVs') + + @property + def input(self): + """ + Shortcut to flattened input + """ + return self.flat_view.input + + def create_shared_params(self): + """ + Any stuff you need will be here + + Returns + ------- + dict : shared params + """ + raise NotImplementedError + + def initial(self, samples, subset='all', zeros=False): + """ + Initial distribution for constructing posterior + + Parameters + ---------- + samples : int - number of samples + subset : determines what space is used can be {all|local|global} + zeros : bool - return zeros if True + + Returns + ------- + Tensor + sampled latent space shape(samples, size) + """ + assert subset in {'all', 'local', 'global'} + if subset == 'all': + size = self.total_size + elif subset == 'global': + size = self.global_size + else: + size = self.local_size + shape = tt.stack([tt.as_tensor(samples), tt.as_tensor(size)]) + if isinstance(zeros, bool): + zeros = int(zeros) + zeros = tt.as_tensor(zeros) + space = ifelse(zeros, + tt.zeros(shape), + tt_rng().normal(shape)) + return space + + def posterior(self, samples=1, zeros=False): + """ + Transforms initial latent space to posterior distribution + + Parameters + ---------- + samples : number of samples + zeros : set initial distribution to zeros + + Returns + ------- + Tensor + posterior space + """ + if self.local_vars and self.global_vars: + return tt.concatenate([ + self.posterior_local(self.initial(samples, 'local', zeros)), + self.posterior_global(self.initial(samples, 'global', zeros)) + ], axis=1) + elif self.local_vars: + return self.posterior_local(self.initial(samples, 'local', zeros)) + elif self.global_vars: + return self.posterior_global(self.initial(samples, 'global', zeros)) + else: + raise ValueError('No FreeVARs in model') + + def sample_over_space(self, space, node): + """ + Take samples of node using `space` as distribution of random variables + + Parameters + ---------- + space : space to sample over + node : node that has flattened input + + Returns + ------- + samples + 
""" + def replace_node(post): + return theano.clone(node, {self.input: post}) + samples, _ = theano.scan(replace_node, space, n_steps=space.shape[0]) + return samples + + def view_from(self, space, name, subset='all', reshape=True): + """ + Construct view on a variable from flattened `space` + + Parameters + ---------- + space : space to take view of variable from + name : str + name of variable + subset : str + determines what space is used can be {all|local|global} + reshape : bool + whether to reshape variable from vectorized view + Returns + ------- + variable + shape == (samples,) + variable.shape + """ + assert subset in {'all', 'local', 'global'} + if subset in {'all', 'local'}: + slc = self.view[name].slc + else: + s = self.view[name].slc.start + e = self.view[name].slc.stop + slc = slice(s - self.local_size, e - self.local_size) + _, _, shape, dtype = self.view[name] + view = space[:, slc] + if reshape: + view = view.reshape((space.shape[0],) + shape) + return view.astype(dtype) + + def posterior_global(self, initial): + """ + Implements posterior distribution from initial latent space + + Parameters + ---------- + initial : initial latent space shape(samples, size) + + Returns + ------- + global posterior space + """ + raise NotImplementedError + + def posterior_local(self, initial): + """ + Implements posterior distribution from initial latent space + + Parameters + ---------- + initial : initial latent space shape(samples, size) + + Returns + ------- + local posterior space + """ + if not self.local_vars: + return initial + mu, rho = self._local_mu_rho() + x = initial * rho2sd(rho) + mu + return x + + def _local_mu_rho(self): + mu = [] + rho = [] + for var in self.local_vars: + mu.append(self.known[var][0].ravel()) + rho.append(self.known[var][1].ravel()) + mu = tt.concatenate(mu) + rho = tt.concatenate(rho) + return mu, rho + + def to_flat_input(self, node): + """ + Replaces vars with flattened view stored in self.input + """ + return theano.clone(node, self.flat_view.replacements, strict=False) + + def log_q_W_local(self, posterior): + """ + log_q_W samples over q for local vars + + Gradient wrt mu, rho in density parametrization + is set to zero to lower variance of ELBO + """ + if not self.local_vars: + return 0 + logp = [] + for var in self.local_vars: + mu = self.known[var][0].ravel() + rho = self.known[var][1].ravel() + x = self.view_from(posterior, var.name, 'all', reshape=False) + q = log_normal(x, Z(mu), rho=Z(rho)) + logp.append(self.weighted_logp(var, q)) + samples = tt.sum(tt.concatenate(logp, axis=1), axis=1) + return samples + + def log_q_W_global(self, posterior): + """ + log_q_W samples over q for global vars + """ + raise NotImplementedError + + def log_q_W(self, posterior): + """ + log_q_W samples over q + """ + return self.log_q_W_global(posterior) + self.log_q_W_local(posterior) + + def log_p_D(self, posterior): + """ + log_p_D samples over q + """ + _log_p_D_ = tt.sum( + list(map(self.weighted_logp, self.model.observed_RVs)) + ) + _log_p_D_ = self.to_flat_input(_log_p_D_) + samples = self.sample_over_space(posterior, _log_p_D_) + return samples + + def weighted_logp(self, var, logp=None): + """ + Weight logp according to given population size + + Parameters + ---------- + var : Random Variable + logp : None or given density for variable + + Returns + ------- + Tensor + weighted logp + """ + tot = self.population.get(var) + if logp is None: + logp = var.logpt + if tot is not None: + tot = tt.as_tensor(tot) + logp = logp * tot / var.shape[0] + return logp + 
+ def KL_q_p_W(self, posterior): + """ + KL(q||p) samples over q + """ + return self.log_q_W(posterior) - self.log_p_W(posterior) + + def log_p_W_local(self, posterior): + """ + log_p_W_local samples over q + """ + if not self.local_vars: + return 0 + _log_p_W_local_ = tt.sum( + list(map(self.weighted_logp, self.local_vars)) + ) + _log_p_W_local_ = self.to_flat_input(_log_p_W_local_) + samples = self.sample_over_space(posterior, _log_p_W_local_) + return samples + + def log_p_W_global(self, posterior): + """ + log_p_W_global samples over q + """ + def logp(var): + return var.logpt + _log_p_W_global_ = tt.sum( + list(map(logp, self.global_vars)) + ) + _log_p_W_global_ = self.to_flat_input(_log_p_W_global_) + samples = self.sample_over_space(posterior, _log_p_W_global_) + return samples + + def log_p_W(self, posterior): + """ + log_p_W samples over q + """ + def logp(var): + return var.logpt + if self.local_vars: + _log_p_W_local_ = tt.sum( + list(map(self.weighted_logp, self.local_vars)) + ) + else: + _log_p_W_local_ = 0 + _log_p_W_global_ = tt.sum( + list(map(logp, self.global_vars)) + ) + _log_p_W_ = _log_p_W_global_ + _log_p_W_local_ + _log_p_W_ = self.to_flat_input(_log_p_W_) + samples = self.sample_over_space(posterior, _log_p_W_) + return samples + + def apply_replacements(self, node, deterministic=False, include=None, exclude=None): + """ + Replace variables in graph with variational approximation. By default, replaces all variables + + Parameters + ---------- + node : Variable + node for replacements + deterministic : bool + whether to use zeros as initial distribution + if True - zero initial point will produce constant latent variables + include : list + latent variables to be replaced + exclude : list + latent variables to be excluded for replacements + + Returns + ------- + node with replacements + """ + if include is not None and exclude is not None: + raise ValueError('Only one parameter is supported {include|exclude}, got two') + if include is not None: + replacements = {k: v for k, v + in self.flat_view.replacements.items() if k in include} + elif exclude is not None: + replacements = {k: v for k, v + in self.flat_view.replacements.items() if k not in exclude} + else: + replacements = self.flat_view.replacements + node = theano.clone(node, replacements, strict=False) + posterior = self.posterior(zeros=deterministic)[0] + return theano.clone(node, {self.input: posterior}) + + def elbo(self, samples=1, pi_local=1, pi_global=1): + """ + Evidence Lower Bound - ELBO + + To fit variational posterior we need to maximize ELBO + wrt parameters + + Parameters + ---------- + samples : Tensor + number of samples + pi_local : Tensor + weight of local variational part + pi_global : Tensor + weight of global variational part + + Returns + ------- + ELBO samples + """ + samples = tt.as_tensor(samples) + pi_local = tt.as_tensor(pi_local) + pi_global = tt.as_tensor(pi_global) + posterior = self.posterior(samples) + elbo = (self.log_p_D(posterior) + - pi_local * self.log_q_W_local(posterior) + - pi_global * self.log_q_W_global(posterior) + + pi_local * self.log_p_W_local(posterior) + + pi_global * self.log_p_W_global(posterior)) + return elbo + + @property + @memoize + @change_flags(compute_test_value='off') + def posterior_fn(self): + In = theano.In + samples = tt.iscalar('n_samples') + zeros = tt.bscalar('zeros') + posterior = self.posterior(samples, zeros) + fn = theano.function([In(samples, 'samples', 1, allow_downcast=True), + In(zeros, 'zeros', 0, allow_downcast=True)], + posterior) + + 
def inner(samples=1, zeros=False):
+            return fn(samples, int(zeros))
+        return inner
+
+    def sample_vp(self, draws=1, zeros=False, hide_transformed=False):
+        if hide_transformed:
+            vars_sampled = [v_ for v_ in self.model.unobserved_RVs
+                            if not str(v_).endswith('_')]
+        else:
+            vars_sampled = [v_ for v_ in self.model.unobserved_RVs]
+        posterior = self.posterior_fn(draws, zeros)
+        names = [var.name for var in self.local_vars + self.global_vars]
+        samples = {name: self.view_from(posterior, name)
+                   for name in names}
+
+        def points():
+            for i in range(draws):
+                yield {name: samples[name][i]
+                       for name in names}
+
+        trace = pm.sampling.NDArray(model=self.model, vars=vars_sampled)
+        trace.setup(draws=draws, chain=0)
+        for point in points():
+            trace.record(point)
+        return pm.sampling.MultiTrace([trace])
+
+    @property
+    @memoize
+    def posterior_to_point_fn(self):
+        names_vars = [var.name for var in self.local_vars + self.global_vars]
+        names_point = names_vars + [var.name for var in self.model.deterministics]
+        point_fn = self.model.fastfn(self.local_vars + self.global_vars + self.model.deterministics)
+
+        def inner(posterior):
+            if posterior.shape[0] > 1:
+                raise ValueError('Posterior should have one sample')
+            point = {name: self.view_from(posterior, name)[0]
+                     for name in names_vars}
+            return dict(zip(names_point, point_fn(point)))
+        return inner
+
+    @property
+    def median(self):
+        posterior = self.posterior_fn(samples=1, zeros=True)
+        return self.posterior_to_point_fn(posterior)
+
+    def random(self):
+        posterior = self.posterior_fn(samples=1, zeros=False)
+        return self.posterior_to_point_fn(posterior)
+
+    @property
+    def local_vmap(self):
+        return self.order.vmap[:len(self.local_vars)]
+
+    @property
+    def global_vmap(self):
+        return self.order.vmap[len(self.local_vars):]
+
+    @property
+    def local_size(self):
+        size = sum([0] + [v.dsize for v in self.local_vars])
+        return size
+
+    @property
+    def global_size(self):
+        return self.total_size - self.local_size
+
+    @property
+    def total_size(self):
+        return self.order.dimensions
+
+    @property
+    def local_slc(self):
+        return slice(0, self.local_size)
+
+    @property
+    def global_slc(self):
+        return slice(self.local_size, self.total_size)
+
+
+class ADVI(BaseApproximation):
+    """ADVI algorithm as described in Kucukelbir et al. [2]_ with
+    the variance reduction of [3]_. ADVI also supports Auto-Encoding
+    Variational Bayes as described in [1]_ and minibatch training.
+
+    The main goal of ADVI is a fast variational inference algorithm
+    that scales to large problems. The limitation of this
+    approximation is that latent variables are assumed to be independent
+    and are each approximated with a normal distribution. These assumptions
+    rarely hold exactly in practice.
+
+    Parameters
+    ----------
+    population : dict[Variable->int]
+        maps an observed_RV to its population size;
+        if not provided, defaults to the full population
+        Note: the population size is `shape[0]` of the whole data
+    known : dict[Variable->(mu, rho)]
+        maps a local random variable to mu and rho for its posterior parametrization;
+        it is used for Auto-Encoding Variational Bayes
+        (AEVB; Kingma and Welling, 2014)[1]_
+    model : Model
+        PyMC3 model whose posterior is going to be approximated
+
+    References
+    ----------
+    .. [1] Kingma, D. P., & Welling, M. (2014).
+        Auto-Encoding Variational Bayes. stat, 1050, 1.
+
+    .. [2] Kucukelbir, A., Ranganath, R., Gelman, A., & Blei, D. (2015).
+        Automatic variational inference in Stan. In Advances in neural
+        information processing systems (pp. 568-576).
+
+    ..
[3] Geoffrey, R., Yuhuai, W., David, D. (2016) + Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI. + NIPS Workshop + """ + def create_shared_params(self): + return {'mu': theano.shared( + self.input.tag.test_value[self.global_slc]), + 'rho': theano.shared( + np.zeros((self.global_size,), dtype=theano.config.floatX))} + + def posterior_global(self, initial): + sd = rho2sd(self.shared_params['rho']) + mu = self.shared_params['mu'] + return sd * initial + mu + + def log_q_W_global(self, posterior): + """ + log_q_W samples over q for global vars + + Gradient wrt mu, rho in density parametrization + is set to zero to lower variance of ELBO + """ + mu = self.shared_params['mu'] + rho = self.shared_params['rho'] + samples = tt.sum(log_normal(posterior[:, self.global_slc], Z(mu), rho=Z(rho)), axis=1) + return samples diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py new file mode 100644 index 0000000000..48bbb8b41c --- /dev/null +++ b/pymc3/variational/inference.py @@ -0,0 +1,147 @@ +import numpy as np +import theano +import pymc3 as pm +from .approximations import ADVI, BaseApproximation +from .advi import adagrad_optimizer +from tqdm import trange + + +APPROXIMATIONS = { + 'advi': ADVI, +} + + +def approximate(n=10000, population=None, local_vars=None, + optimizer=None, method='advi', + samples=1, pi_local=1, pi_global=1, + callbacks=None, learning_rate=.001, epsilon=.1, + model=None, more_params=None, more_updates=None, + *args, **kwargs): + """ + Interface for efficient variational inference with gradient + variance reduction as described in [4]_ + + + PyMC3 supports the following variational inference methods: + 1) Automatic differentiation variational inference (ADVI) [2]_ + + Parameters + ---------- + n : int + number of training iterations + population : dict[Variable->int] + maps observed_RV to its population size + if not provided defaults to full population + Note: population size is `shape[0]` of the whole data + local_vars : dict[Variable->(mu, rho)] + maps local random variable to mu and rho for posterior + parametrization, it is used for Autoencoding Variational Bayes + (AEVB; Kingma and Welling, 2014)[1]_ + optimizer : callable + optional custom optimizer to be called in the following way: + :code:`updates = optimizer(-elbo, list_of_params)` + method : str|Approximation + string description of approximation to be used or + Approximation instance used to calculate elbo and provide + for shared params + samples : int|Tensor + number of Monte Carlo samples used for approximation, + defaults to 1 + pi_local : float|Tensor + pi_local in [0;1] reweighting constant for local KL divergence + this trick was described in [3]_ for fine-tuning minibatch ADVI + pi_global : float|Tensor + pi_global in [0;1] reweighting constant for global KL divergence + this trick was described in [3]_ for fine-tuning minibatch ADVI + callbacks : list[callable] + callables that will be called in the following way + :code:`callback(Approximation, elbo_history, i)` + learning_rate : float + learning rate for adagrad optimizer, + it will be ignored if optimizer is passed + epsilon : float - epsilon for adagrad optimizer, + it will be ignored if optimizer is passed + model : Model + Probabilistic model if function is called out of context + more_params : list + optional more parameters for computing gradients + more_updates : dict + more updates are included in step function + args : additional args that will be passed to Approximation + kwargs : additional kwargs that 
will be passed to Approximation + + Returns + ------- + (Approximation, elbo_history) + + Notes + ----- + Remember that you can manipulate pi and number of samples with callbacks + + References + ---------- + .. [1] Kingma, D. P., & Welling, M. (2014). + Auto-Encoding Variational Bayes. stat, 1050, 1. + + .. [2] Kucukelbir, A., Ranganath, R., Gelman, A., & Blei, D. (2015). + Automatic variational inference in Stan. In Advances in neural + information processing systems (pp. 568-576). + + .. [3] Blundell, C., Cornebise, J., Kavukcuoglu, K., & Wierstra, D. (2015). + Weight Uncertainty in Neural Network. In Proceedings of the 32nd + International Conference on Machine Learning (ICML-15) (pp. 1613-1622). + + .. [4] Geoffrey, R., Yuhuai, W., David, D. (2016) + Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI. + NIPS Workshop + """ + if isinstance(method, BaseApproximation): + approx = method + else: + approx = APPROXIMATIONS[method](known=local_vars, population=population, + model=model, *args, **kwargs) + if callbacks is None: + callbacks = [] + if optimizer is None: + optimizer = adagrad_optimizer(learning_rate, epsilon) + elbo = approx.elbo(samples=samples, + pi_local=pi_local, + pi_global=pi_global).mean() + params = approx.params + if more_params is not None: + params += more_params + updates = optimizer(-elbo, params) + if more_updates is not None: + more_updates.update(updates) + updates = more_updates + step = theano.function([], elbo, updates=updates) + i = 0 + elbos = np.empty(n) + try: + progress = trange(n) + for i in progress: + e = step() + elbos[i] = e + for callback in callbacks: + callback(approx, elbos[:i+1], i) + if n < 10: + progress.set_description('ELBO = {:,.5g}'.format(elbos[i])) + elif i % (n // 10) == 0 and i > 0: + avg_elbo = elbos[i - n // 10:i].mean() + progress.set_description('Average ELBO = {:,.5g}'.format(avg_elbo)) + except KeyboardInterrupt: + elbos = elbos[:i] + if n < 10: + pm._log.info('Interrupted at {:,d} [{:.0f}%]: ELBO = {:,.5g}'.format( + i, 100 * i // n, elbos[i])) + else: + avg_elbo = elbos[i - n // 10:].mean() + pm._log.info('Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format( + i, 100 * i // n, avg_elbo)) + else: + if n < 10: + pm._log.info('Finished [100%]: ELBO = {:,.5g}'.format(elbos[-1])) + else: + avg_elbo = elbos[-n // 10:].mean() + pm._log.info('Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo)) + return approx, elbos
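
For reference (not part of the patch): a rough, untested sketch of how the generator support added to `pandas_to_array` and the new `total_size` argument could be combined for minibatch training. The dataset, batch size, helper names (`minibatches`, `total`) and the model are purely illustrative assumptions; dtype handling (casting batches to `theano.config.floatX`) may need extra care.

    import numpy as np
    import pymc3 as pm
    from pymc3.theanof import generator  # GeneratorOp shortcut added in this patch

    data = np.random.randn(1000)  # illustrative full dataset
    total = data.shape[0]

    def minibatches(batch_size=100):
        # GeneratorOp expects an endless generator yielding same-dtype arrays
        while True:
            idx = np.random.randint(0, total, batch_size)
            yield data[idx]

    # a tensor that yields a new batch every time it is evaluated
    minibatch_x = generator(minibatches())

    with pm.Model():
        mu = pm.Normal('mu', mu=0, sd=10)
        # the minibatch logp is rescaled by total_size / batch_size,
        # as implemented for FreeRV/ObservedRV above
        pm.Normal('x', mu=mu, sd=1, observed=minibatch_x, total_size=total)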
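Likewise, a minimal sketch of fitting the new ADVI approximation with `approximate` and drawing posterior samples with `sample_vp`, mirroring `test_advi_optimizer` and `test_sample_vp` above; the toy data, model and iteration count are arbitrary:

    import numpy as np
    import pymc3 as pm
    from pymc3.variational.approximations import ADVI
    from pymc3.variational.inference import approximate

    observed = np.random.randn(100) + 5.0  # toy data

    with pm.Model():
        mu = pm.Normal('mu', mu=0, sd=10)
        pm.Normal('x', mu=mu, sd=1, observed=observed)

        mf = ADVI()                                    # mean-field approximation
        approx, elbos = approximate(10000, method=mf)  # returns (Approximation, elbo history)
        trace = mf.sample_vp(draws=1000, hide_transformed=True)

    print(np.mean(trace['mu']), mf.median['mu'])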
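Finally, a small numerical check (illustrative only) of the `sd2rho`/`rho2sd` converters and the four equivalent `log_normal` parametrizations documented in `dist_math`:

    import numpy as np
    from pymc3.distributions.dist_math import log_normal, rho2sd, sd2rho

    sd = 2.0
    # rho = log(exp(sd) - 1) and sd = log1p(exp(rho)) are inverses of each other
    assert np.isclose(rho2sd(sd2rho(sd)).eval(), sd)

    x, mean = 1.0, 0.0
    # sd, w = log(sd), rho = log(exp(sd) - 1) and tau = sd**-1 (as implemented here)
    # all parametrize the same density, so the values should agree
    vals = [log_normal(x, mean, sd=sd).eval(),
            log_normal(x, mean, w=np.log(sd)).eval(),
            log_normal(x, mean, rho=sd2rho(sd).eval()).eval(),
            log_normal(x, mean, tau=1.0 / sd).eval()]
    assert np.allclose(vals, vals[0])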