SparseLatent GP #2951


Closed. Wants to merge 20 commits.
Changes from 13 commits
Commits (20)
e13c4cd
[WIP] implement LatentSparse GP as suggested by @bwengals
smartass101 Apr 20, 2018
0d34aed
attempt at implementing LatenSparse dTC and FITC approximations
smartass101 Apr 25, 2018
1fa1ea7
[WIP] add a preliminary failing test suite and fix a few stupid typos
smartass101 Apr 27, 2018
768fa70
fix DTC conditional by clipping its covariance in [0,inf)
smartass101 Apr 27, 2018
44cfbb9
optimization: do not solve a second time if K is symmetric ... if it …
smartass101 Apr 27, 2018
c3cdd2c
use the actual prior name provided
smartass101 Apr 27, 2018
ab6e01d
refactored test suite to see prior and cond match separately. But had…
smartass101 Apr 27, 2018
2389392
use a more fitting var name
smartass101 Apr 27, 2018
e63278e
various style fixes and use correct shape kwarg TODO shape_u passing
smartass101 Apr 30, 2018
0f40ea7
make Lambda calculation explicit and clip final cov matrix
smartass101 Apr 30, 2018
974d84b
rewrite tests for DRY and explain FITC prior offset
smartass101 Apr 30, 2018
522cb15
remove print in test and write more concisely log sum
smartass101 Apr 30, 2018
812d421
remove wrong covariance clipping
smartass101 May 1, 2018
e9d3241
reparametrize the FITC noise about the DTC mean to be 0-centered
smartass101 May 7, 2018
9959049
Fix the DTC and FITC conditionals. They are the exact p(f_* | u) cond…
smartass101 May 7, 2018
26828b1
make test suite reproducible and formulate as sparse conditionals
smartass101 May 7, 2018
f05bac7
Improve the dosctrings
smartass101 May 7, 2018
cf3c0fa
fix super() usage for PY2
smartass101 May 7, 2018
e9a28b5
enhance plot_gp_dist to accept label argument for last fill
smartass101 May 14, 2018
fbdbdf8
add reworked DTC notebook by @bwengals
smartass101 May 14, 2018
2 changes: 1 addition & 1 deletion pymc3/gp/__init__.py
@@ -1,4 +1,4 @@
from . import cov
from . import mean
from . import util
from .gp import Latent, Marginal, MarginalSparse, TP, MarginalKron
from .gp import Latent, LatentSparse, Marginal, MarginalSparse, TP, MarginalKron
223 changes: 222 additions & 1 deletion pymc3/gp/gp.py
@@ -7,7 +7,8 @@
from pymc3.gp.cov import Covariance, Constant
from pymc3.gp.mean import Zero
from pymc3.gp.util import (conditioned_vars,
infer_shape, stabilize, cholesky, solve_lower, solve_upper)
infer_shape, stabilize, cholesky, solve_lower, solve_upper,
invert_dot, project_inverse)
from pymc3.distributions import draw_values
from pymc3.distributions.dist_math import eigh
from ..math import cartesian, kron_dot, kron_diag
@@ -207,6 +208,226 @@ def conditional(self, name, Xnew, given=None, **kwargs):
return pm.MvNormal(name, mu=mu, chol=chol, shape=shape, **kwargs)


@conditioned_vars(["X", "Xu", "f"])
class LatentSparse(Latent):
R"""
Approximate latent Gaussian process (GP).

The `gp.LatentSparse` class is a sparse, inducing-point approximation of a GP. No additive
noise is assumed. It is called "Latent" because the underlying function
values are treated as latent variables. It has a `prior` method and a
`conditional` method. Given a mean and covariance function the
function :math:`f(X)` is modeled as,

.. math::

f \sim \int p(f \mid u) p(u) \mathrm{d}u

where `u` is the GP prior evaluated at a set of inducing points `Xu`

.. math::

u \mid X_u \sim \text{MvNormal}\left( \mu(X_u), K(X_u, X_u) \right)

The DTC and FITC approximations replace the exact conditional `p(f | u)`
with a simplified `q(f | u)`; the resulting `f` is a proper GP prior
only in the case of the FITC approximation.

Use the `prior` and `conditional` methods to actually construct random
variables representing the unknown, or latent, function whose
distribution is the GP prior or GP conditional. This GP implementation
can be used to implement regression on data that is not normally
distributed. For more information on the `prior` and `conditional` methods,
see their docstrings.

Parameters
----------
cov_func : None, 2D array, or instance of Covariance
The covariance function. Defaults to zero.
Must implement the `diag` method for the FITC approximation
Contributor:

all of them must implement the diag method. I think there's no need to specify this.

Contributor (author):

Perhaps I should reformulate this to say that the cov_func has to implement a computationally efficient diag method (i.e. not use tt.diagonal(full(X, X))) for the approximation to be sparse?
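To illustrate the point, a minimal standalone sketch (plain NumPy, hypothetical helper names) of why a dedicated diag method matters for a stationary kernel like ExpQuad: the diagonal is just the signal variance and costs O(n), whereas tt.diagonal(full(X, X)) would first build the full (n, n) matrix.

import numpy as np

def expquad_full(X, ls=0.1, var=1.0):
    # full (n, n) covariance matrix
    d2 = np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=-1)
    return var * np.exp(-0.5 * d2 / ls ** 2)

def expquad_diag(X, ls=0.1, var=1.0):
    # diagonal only: k(x, x) = var for a stationary kernel, O(n) memory
    return np.full(X.shape[0], var)

X = np.linspace(0, 1, 10)[:, None]
assert np.allclose(np.diagonal(expquad_full(X)), expquad_diag(X))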

mean_func : None, instance of Mean
The mean function. Defaults to zero.
approx : str
Approximation to use. One of ['DTC', 'FITC']

Examples
--------
.. code:: python

# A one dimensional column vector of inputs.
X = np.linspace(0, 1, 10)[:, None]

# A smaller set of inducing inputs
Xu = np.linspace(0, 1, 5)[:, None]

with pm.Model() as model:
# Specify the covariance function.
cov_func = pm.gp.cov.ExpQuad(1, ls=0.1)

# Specify the GP. The default mean function is `Zero`.
gp = pm.gp.LatentSparse(cov_func=cov_func)

# Place a GP prior over the function f at points X and inducing points Xu
f = gp.prior("f", X=X, Xu=Xu)

...

# After fitting or sampling, specify the distribution
# at new points with .conditional
Xnew = np.linspace(-1, 2, 50)[:, None]

with model:
fcond = gp.conditional("fcond", Xnew=Xnew)
"""

_available_approx = ("DTC", "FITC")

def __init__(self, mean_func=Zero(), cov_func=Constant(0.0), approx="FITC"):
if approx not in self._available_approx:
raise NotImplementedError(approx)
self.approx = approx
super(Latent, self).__init__(mean_func, cov_func)

def _build_prior(self, name, X, Xu, **kwargs):
mu = self.mean_func(X) # (n,)
Luu = cholesky(stabilize(self.cov_func(Xu))) # (m, m) \sqrt{K_u}
shape_u = infer_shape(Xu, kwargs.pop("shape_u", None))
shape = infer_shape(X, kwargs.pop("shape", None))
Contributor:

what about shape_f?

v = pm.Normal(name + "_u_rotated_", mu=0.0, sd=1.0, shape=shape_u, **kwargs)
# (m,) prior at inducing points: mean + chol method of MvGaussian
u = pm.Deterministic(name + '_u', self.mean_func(Xu) + tt.dot(Luu, v))
Kuuiu = invert_dot(Luu, u) # (m,) K_{uu}^{-1} u
Kfu = self.cov_func(X, Xu) # (n, m)
f_ = mu + tt.dot(Kfu, Kuuiu) # (n, m) @ (m,) = (n,)
if self.approx == 'DTC':
f = pm.Deterministic(name, f_)
elif self.approx == 'FITC':
Qff_diag = project_inverse(Kfu, Luu, diag=True)
Kff_diag = self.cov_func(X, diag=True)
# MvNormal with diagonal cov is Normal with var=diag(cov)
var = tt.clip(Kff_diag - Qff_diag, 0.0, np.inf)
f = pm.Normal(name, mu=f_, tau=tt.inv(var), shape=shape)
Contributor:

Why use tau and not sd? If there is a reason this is preferred, would you mind changing this in MarginalSparse while you're at it?

Contributor:

also, shape should be the shape of f here, not the shape of u. Sorry if you know and you're just not quite done with this part yet.

Contributor (author):

  • I was thinking that sd=tt.sqrt(var) might be computationally more expensive than tau=tt.inv(var). But I could be wrong (especially in terms of numerical stability), please correct me on this.
  • good catch with the shape parameters, maybe that's why FITC isn't working. I suggest to slightly change the implementation of infer_shape to
def infer_shape(X, kwargs=None, shape_kw='shape'):
    if kwargs is None:
        try:
            shape = np.int(X.shape[0])
        except TypeError:
            raise TypeError("Cannot infer '%s', provide as a keyword argument" % shape_kw)
    else:
        shape = kwargs.pop(shape_kw)
    return shape

and in the docstring mention that if X and/or Xu are tensors, the keyword arguments shape and/or shape_u must be provided, respectively. Also, should I provide **kwargs to the second Normal for FITC as well?

Contributor:

Not sure if it's more expensive computationally. Should profile it.

> I suggest to slightly change the implementation

Good call

> Also, should I provide **kwargs to the second Normal for FITC as well?

I'm thinking no... any reason you can think of at all for someone to pass kwargs there?
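As a quick sanity check (hypothetical snippet, not a profile): both parametrizations define the same density, so the choice only affects whether tt.sqrt or tt.inv is applied to var.

import numpy as np
import pymc3 as pm

var = np.array([0.5, 1.0, 2.0])
x = np.array([0.1, -0.3, 0.7])

# the same Normal written with tau and with sd; the log-densities should agree
logp_tau = pm.Normal.dist(mu=0.0, tau=1.0 / var).logp(x).eval()
logp_sd = pm.Normal.dist(mu=0.0, sd=np.sqrt(var)).logp(x).eval()
assert np.allclose(logp_tau, logp_sd)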

return f

def prior(self, name, X, Xu, **kwargs):
R"""
Returns the GP prior distribution evaluated over the input
locations `X` with inducing locations `Xu`.

This is the prior probability over the space
of functions described by its mean and covariance function.

The DTC and FITC approximations use a simplified form of the true conditional `p(f | u)`

.. math::
f \mid X, X_u \sim \text{MvNormal}\left( K(X, X_u) K(X_u, X_u)^{-1} u, K(X, X) - Q(X, X) \right)

where

.. math::
u \mid X_u \sim \text{MvNormal}\left( \mu(X_u), K(X_u, X_u) \right)

and

.. math::

Q(X, X') = K(X, X_u) K(X_u, X_u)^{-1} K(X_u, X')

The DTC approximation uses (resulting in a non-GP prior)

.. math::
K(X, X) - Q(X, X) \approx 0

The FITC approximation uses (resulting in a GP prior)

.. math::

K(X, X) - Q(X, X) \approx \mathrm{diag}(K(X, X) - Q(X, X))

Parameters
----------
name : string
Name of the random variable
X : array-like
Function input values. If one-dimensional, must be a column
vector with shape `(n, 1)`.
Xu: array-like
The inducing points. Must have the same number of columns as `X`.
**kwargs
Extra keyword arguments that are passed to distribution constructor.
"""

f = self._build_prior(name, X, Xu, **kwargs)
self.X = X
self.Xu = Xu
self.f = f
return f

def _get_given_vals(self, given):
if given is None:
given = {}
if 'gp' in given:
cov_total = given['gp'].cov_func
mean_total = given['gp'].mean_func
else:
cov_total = self.cov_func
mean_total = self.mean_func
if all(val in given for val in ['X', 'f', 'Xu']):
X, Xu, f = given['X'], given['Xu'], given['f']
else:
X, Xu, f = self.X, self.Xu, self.f
return X, Xu, f, cov_total, mean_total

def _build_conditional(self, Xnew, X, Xu, f, cov_total, mean_total):
Kuu = cov_total(Xu)
Luu = cholesky(stabilize(Kuu))

Kuf = cov_total(Xu, X)
Kuffu = tt.dot(Kuf, Kuf.T)
Luffu = cholesky(stabilize(Kuffu))
Ksu = self.cov_func(Xnew, Xu)
r = f - mean_total(X) # the equations are derived for 0-mean f
Kuuiu = invert_dot(Luffu, tt.dot(Kuf, r))
mus = self.mean_func(Xnew) + tt.dot(Ksu, Kuuiu)
if self.approx == 'FITC':
Kff_diag = self.cov_func(X, diag=True)
Qff_diag = project_inverse(Kuf.T, Luu, diag=True)
Lambda = tt.clip(Kff_diag - Qff_diag, 0.0, np.inf)
Qsf = project_inverse(Ksu, Luu, P_T=Kuf)
mus += tt.dot(Qsf, f / Lambda)
Qss = project_inverse(Ksu, Luu)
Kss = self.cov_func(Xnew)
cov = Kss - Qss
if self.approx == 'FITC':
cov -= tt.dot(Qsf, tt.transpose(Qsf / Lambda))
return mus, cov

def conditional(self, name, Xnew, given=None, **kwargs):
R"""
Returns the approximate conditional distribution evaluated
over new input locations `Xnew`.

Parameters
----------
name : string
Name of the random variable
Xnew : array-like
Function input values.
given : dict
Can optionally take as key value pairs: `X`, `f`, `Xu`,
and `gp`. See the section in the documentation on additive GP
models in PyMC3 for more information.
**kwargs
Extra keyword arguments that are passed to `MvNormal` distribution
constructor.
"""
givens = self._get_given_vals(given)
mu, cov = self._build_conditional(Xnew, *givens)
chol = cholesky(stabilize(cov))
shape = infer_shape(Xnew, kwargs.pop("shape", None))
return pm.MvNormal(name, mu=mu, chol=chol, shape=shape, **kwargs)


@conditioned_vars(["X", "f", "nu"])
class TP(Latent):
"""
18 changes: 18 additions & 0 deletions pymc3/gp/util.py
@@ -9,6 +9,24 @@
solve_upper = tt.slinalg.Solve(A_structure='upper_triangular')
solve = tt.slinalg.Solve(A_structure='general')

def invert_dot(L, X):
Contributor:

These are great! For consistency though, it would be good to use these functions throughout the gp source code (I know, a pain) or just write the theano ops as before.

"""Wrapper for common pattern K^{-1} @ X where K = L @ L^T"""
return solve_upper(L.T, solve_lower(L, X))

def project_inverse(P, L, diag=False, P_T=None):
"""Wrapper for common pattern P @ K^{-1} @ P^T where K = L @ L^T"""
same_P = P_T is None
if same_P:
P_T = P.T
A = solve_lower(L, P_T)
if diag:
return tt.sum(A * A, axis=0) # the diagonal of A.T @ A
else:
if same_P:
return tt.dot(A.T, A)
else:
return tt.dot(P, invert_dot(L, P_T))


def infer_shape(X, n_points=None):
if n_points is None:
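For reference, a dense NumPy sketch (standalone, hypothetical) of what the two helpers above compute, checked against a direct solve:

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(5, 5)
K = np.dot(A, A.T) + 5.0 * np.eye(5)   # symmetric positive-definite matrix
L = np.linalg.cholesky(K)              # K = L @ L.T
P = rng.randn(3, 5)
x = rng.randn(5)

# invert_dot(L, x): K^{-1} x via two triangular solves
kinv_x = np.linalg.solve(L.T, np.linalg.solve(L, x))
assert np.allclose(kinv_x, np.linalg.solve(K, x))

# project_inverse(P, L): P K^{-1} P^T; with diag=True, only its diagonal
proj = np.dot(P, np.linalg.solve(K, P.T))
A_ = np.linalg.solve(L, P.T)
assert np.allclose(np.sum(A_ * A_, axis=0), np.diag(proj))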
61 changes: 61 additions & 0 deletions pymc3/tests/test_gp.py
@@ -683,6 +683,67 @@ def testLatent2(self):
npt.assert_allclose(latent_logp, self.logp, atol=5)


class TestLatentVsLatentSparse(object):
R"""
Compare logp of models Latent and LatentSparse.
Should be nearly equal when inducing points are same as inputs.
"""
def setup_method(self):
X = np.random.randn(50,3)
y = np.random.randn(50)*0.01
Xnew = np.random.randn(60, 3)
pnew = np.random.randn(60)*0.01
with pm.Model() as model:
cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3])
Contributor:

To make this test stable, could use a covariance function that has a white noise term.
pm.gp.cov.ExpQuad(...) + pm.gp.cov.WhiteNoise(0.01)
or something
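A concrete version of that suggestion, reusing the lengthscales already in setup_method (a sketch only; the 0.01 noise scale is just illustrative):

# small white-noise term keeps the Cholesky factorizations in the test well conditioned
cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3]) + pm.gp.cov.WhiteNoise(0.01)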

Member:

Should these be subclassing SeededTest anyways, to make them reproducible?

Contributor:

oh yes, that's true

mean_func = pm.gp.mean.Constant(0.5)
gp = pm.gp.Latent(mean_func, cov_func)
f = gp.prior("f", X)
chol = np.linalg.cholesky(cov_func(X).eval())
y_rotated = np.linalg.solve(chol, y - mean_func(X).eval())
logp_params = { 'f_rotated_': y_rotated }
self.logp_prior = model.logp(logp_params)
with model:
p = gp.conditional("p", Xnew)
logp_params['p'] = pnew
self.logp_conditional = model.logp(logp_params)
self.X = X
self.Xnew = Xnew
self.y = y
self.pnew = pnew
self.gp = gp

@pytest.mark.parametrize('approx', ['FITC', 'DTC'])
def test_approximations(self, approx):
with pm.Model() as model:
cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3])
mean_func = pm.gp.mean.Constant(0.5)
gp = pm.gp.LatentSparse(mean_func, cov_func, approx=approx)
f = gp.prior("f", self.X, self.X)
chol = np.linalg.cholesky(cov_func(self.X).eval())
y_rotated = np.linalg.solve(chol, self.y - mean_func(self.X).eval())
model_params = {
"f_u_rotated_": y_rotated,
}
if approx == 'FITC':
model_params['f'] = self.y # need to specify as well since f ~ Normal(f_, diag(Kff-Qff))
# test prior logp
approx_prior_logp = model.logp(model_params)
if approx == 'FITC':
# for X=Xu FITC degenerates to DTC and the small residual diag(Kff - Qff) offsets the logp
fitc_logp = pm.Normal.dist(mu=f.distribution.mu, sd=f.distribution.sd).logp_sum(self.y)
atol = -pm.distributions.draw_values([fitc_logp], model_params)[0]
else:
atol = 0
npt.assert_allclose(approx_prior_logp, self.logp_prior, atol=atol, rtol=1e-2)
# test prior + conditional logp
with model:
p = gp.conditional("p", self.Xnew)
model_params['p'] = self.pnew
approx_cond_logp = model.logp(model_params)
npt.assert_allclose(approx_cond_logp, self.logp_conditional, atol=0, rtol=5e-2)



class TestMarginalVsMarginalSparse(object):
R"""
Compare logp of models Marginal and MarginalSparse.