
SparseLatent GP #2951


Status: Closed · wants to merge 20 commits

Commits:
e13c4cd  [WIP] implement LatentSparse GP as suggested by @bwengals (smartass101, Apr 20, 2018)
0d34aed  attempt at implementing LatenSparse dTC and FITC approximations (smartass101, Apr 25, 2018)
1fa1ea7  [WIP] add a preliminary failing test suite and fix a few stupid typos (smartass101, Apr 27, 2018)
768fa70  fix DTC conditional by clipping its covariance in [0,inf) (smartass101, Apr 27, 2018)
44cfbb9  optimization: do not solve a second time if K is symmetric ... if it … (smartass101, Apr 27, 2018)
c3cdd2c  use the actual prior name provided (smartass101, Apr 27, 2018)
ab6e01d  refactored test suite to see prior and cond match separately. But had… (smartass101, Apr 27, 2018)
2389392  use a more fitting var name (smartass101, Apr 27, 2018)
e63278e  various style fixes and use correct shape kwarg TODO shape_u passing (smartass101, Apr 30, 2018)
0f40ea7  make Lambda calculation explicit and clip final cov matrix (smartass101, Apr 30, 2018)
974d84b  rewrite tests for DRY and explain FITC prior offset (smartass101, Apr 30, 2018)
522cb15  remove print in test and write more concisely log sum (smartass101, Apr 30, 2018)
812d421  remove wrong covariance clipping (smartass101, May 1, 2018)
e9d3241  reparametrize the FITC noise about the DTC mean to be 0-centered (smartass101, May 7, 2018)
9959049  Fix the DTC and FITC conditionals. They are the exact p(f_* | u) cond… (smartass101, May 7, 2018)
26828b1  make test suite reproducible and formulate as sparse conditionals (smartass101, May 7, 2018)
f05bac7  Improve the dosctrings (smartass101, May 7, 2018)
cf3c0fa  fix super() usage for PY2 (smartass101, May 7, 2018)
e9a28b5  enhance plot_gp_dist to accept label argument for last fill (smartass101, May 14, 2018)
fbdbdf8  add reworked DTC notebook by @bwengals (smartass101, May 14, 2018)
666 changes: 666 additions & 0 deletions docs/source/notebooks/GP-LatentSparse.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pymc3/gp/__init__.py
@@ -1,4 +1,4 @@
 from . import cov
 from . import mean
 from . import util
-from .gp import Latent, Marginal, MarginalSparse, TP, MarginalKron
+from .gp import Latent, LatentSparse, Marginal, MarginalSparse, TP, MarginalKron
248 changes: 247 additions & 1 deletion pymc3/gp/gp.py
@@ -7,7 +7,8 @@
 from pymc3.gp.cov import Covariance, Constant
 from pymc3.gp.mean import Zero
 from pymc3.gp.util import (conditioned_vars,
-                           infer_shape, stabilize, cholesky, solve_lower, solve_upper)
+                           infer_shape, stabilize, cholesky, solve_lower, solve_upper,
+                           invert_dot, project_inverse)
 from pymc3.distributions import draw_values
 from pymc3.distributions.dist_math import eigh
 from ..math import cartesian, kron_dot, kron_diag
@@ -207,6 +208,251 @@ def conditional(self, name, Xnew, given=None, **kwargs):
return pm.MvNormal(name, mu=mu, chol=chol, shape=shape, **kwargs)


@conditioned_vars(["X", "Xu", "f"])
class LatentSparse(Latent):
R"""
Approximate latent Gaussian process (GP).

The `gp.LatentSparse` class is a sparse approximation of a latent GP. No additive
noise is assumed. It is called "Latent" because the underlying function
values are treated as latent (unobserved) variables. It has a `prior` method
and a `conditional` method. Given a mean and covariance function, the
function :math:`f(X)` is modeled as,

.. math::

f \sim \int p(f \mid u) p(u) \mathrm{d}u

where `u` is the GP function prior on a set of inducing points `Xu`

.. math::

u \mid X_u \sim \text{MvNormal}\left( \mu(X_u), K(X_u, X_u) \right)

The DTC and FITC approximations, as formulated in
`Quinonero-Candela+Rasmussen, 2006`_, replace the exact conditional
`p(f | u)` with a simplified `q(f | u)`. Both condition the full set of
points `X` on a non-sparse latent GP over a small set of inducing points
`Xu`, but simplify the conditional covariance over `X` to zero (DTC) or
to its diagonal (FITC).

Use the `prior` and `conditional` methods to actually construct random
variables representing the unknown, or latent, function whose
distribution is the GP prior or GP conditional. This GP implementation
can be used to implement regression on data that is not normally
distributed. For more information on the `prior` and `conditional` methods,
see their docstrings.

.. _`Quinonero-Candela+Rasmussen, 2006`: http://www.jmlr.org/papers/v6/quinonero-candela05a.html

Parameters
----------
cov_func : None, 2D array, or instance of Covariance
The covariance function. Defaults to zero.
mean_func : None, instance of Mean
The mean function. Defaults to zero.
approx : str
Approximation to use. One of ('DTC', 'FITC')

Examples
--------
.. code:: python

# A one dimensional column vector of inputs.
X = np.linspace(0, 1, 10)[:, None]

# A smaller set of inducing inputs
Xu = np.linspace(0, 1, 5)[:, None]

with pm.Model() as model:
# Specify the covariance function.
cov_func = pm.gp.cov.ExpQuad(1, ls=0.1)

# Specify the GP. The default mean function is `Zero`.
gp = pm.gp.LatentSparse(cov_func=cov_func)

# Place a GP prior over the function f at points X and inducing points Xu
f = gp.prior("f", X=X, Xu=Xu)

...

# After fitting or sampling, specify the distribution
# at new points with .conditional
Xnew = np.linspace(-1, 2, 50)[:, None]

with model:
fcond = gp.conditional("fcond", Xnew=Xnew)
"""

_available_approx = ("DTC", "FITC")

def __init__(self, mean_func=Zero(), cov_func=Constant(0.0), approx="FITC"):
if approx not in self._available_approx:
raise NotImplementedError(approx)
self.approx = approx
super(Latent, self).__init__(mean_func, cov_func)

def _build_prior(self, name, X, Xu, **kwargs):
mu = self.mean_func(X) # (n,)
Luu = cholesky(stabilize(self.cov_func(Xu))) # (m, m) \sqrt{K_u}
shape_u = infer_shape(Xu, kwargs.pop("shape_u", None))
shape = infer_shape(X, kwargs.pop("shape", None))
[Review comment by a contributor]: what about shape_f?
v = pm.Normal(name + "_u_rotated_", mu=0.0, sd=1.0, shape=shape_u, **kwargs)
# (m,) prior at inducing points: mean + chol method of MvGaussian
u = pm.Deterministic(name + '_u', self.mean_func(Xu) + tt.dot(Luu, v))
Kuuiu = invert_dot(Luu, u) # (m,) K_{uu}^{-1} u
Kfu = self.cov_func(X, Xu) # (n, m)
f_ = mu + tt.dot(Kfu, Kuuiu) # (n, m) @ (m,) = (n,)
if self.approx == 'DTC':
f = pm.Deterministic(name, f_)
elif self.approx == 'FITC':
Qff_diag = project_inverse(Kfu, Luu, diag=True)
Kff_diag = self.cov_func(X, diag=True)
# MvNormal with diagonal cov is Normal with var=diag(cov)
var = tt.clip(Kff_diag - Qff_diag, 0.0, np.inf)
# The noise about the DTC mean according to the FITC diagonal cov
v2 = pm.Normal(name + '_FITC_noise_', tau=tt.inv(var), shape=shape)
f = pm.Deterministic(name, f_ + v2) # add the FITC noise to the DTC mean
return f
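
A minimal NumPy sketch (illustrative only, not part of the diff) of the non-centered parameterization used in `_build_prior`: with `v ~ Normal(0, 1)`, `u = mean + Luu @ v` is marginally `MvNormal(mean(Xu), Kuu)`:

    import numpy as np

    rng = np.random.default_rng(0)
    Xu = np.linspace(0, 1, 5)[:, None]
    # toy squared-exponential kernel, with jitter for a stable Cholesky
    Kuu = np.exp(-0.5 * ((Xu - Xu.T) / 0.1) ** 2) + 1e-8 * np.eye(5)
    Luu = np.linalg.cholesky(Kuu)

    v = rng.standard_normal((5, 200000))
    u = Luu @ v  # zero mean function for simplicity
    print(np.abs(np.cov(u) - Kuu).max())  # small: empirical cov matches Kuu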

def prior(self, name, X, Xu, **kwargs):
R"""
Returns the GP prior distribution evaluated over the input
locations `X` with inducing locations `Xu`.

This is the prior probability over the space
of functions described by its mean and covariance function.

The DTC and FITC approximations use a simplified form of the true conditional `p(f | u)`

.. math::
f \mid X, X_u \sim \text{MvNormal}\left( K(X, X_u) K(X_u, X_u)^{-1} u, K(X, X) - Q(X, X) \right)

where

.. math::
u \mid X_u \sim \text{MvNormal}\left( \mu(X_u), K(X_u, X_u) \right)

and

.. math::

Q(X, X') = K(X, X_u) K(X_u, X_u)^{-1} K(X_u, X')

The DTC (deterministic training conditional) approximation uses

.. math::
K(X, X) - Q(X, X) \approx 0

so the DTC approximation reduces `f` to the deterministic
conditional mean of `f | u`.

The FITC (fully independent training conditional) approximation uses

.. math::

K(X, X) - Q(X, X) \approx \mathrm{diag}(K(X, X) - Q(X, X))

which adds independent (diagonal) Normal noise about the DTC mean,
with variances given by the diagonal of the true conditional covariance.

Parameters
----------
name : string
Name of the random variable
X : array-like
Function input values. If one-dimensional, must be a column
vector with shape `(n, 1)`.
Xu : array-like
The inducing points. Must have the same number of columns as `X`.
**kwargs
Extra keyword arguments that are passed to the distribution constructor.
The `shape` and `shape_u` kwargs specify the number of points in `X` and `Xu`,
respectively. They are necessary when the shapes cannot be inferred.
"""

f = self._build_prior(name, X, Xu, **kwargs)
self.X = X
self.Xu = Xu
self.f = f
return f
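
As a sanity check on the DTC/FITC formulas above, a hedged NumPy sketch (the toy kernel `k` and inputs are illustrative) computing the Nystrom matrix Q(X, X) = K(X, Xu) K(Xu, Xu)^{-1} K(Xu, X) and the FITC diagonal correction:

    import numpy as np

    def k(a, b, ls=0.1):
        """Toy squared-exponential kernel."""
        return np.exp(-0.5 * ((a - b.T) / ls) ** 2)

    X = np.linspace(0, 1, 10)[:, None]
    Xu = np.linspace(0, 1, 5)[:, None]

    Kff = k(X, X)
    Kfu = k(X, Xu)
    Kuu = k(Xu, Xu) + 1e-8 * np.eye(len(Xu))

    Qff = Kfu @ np.linalg.solve(Kuu, Kfu.T)
    # DTC drops this correction entirely; FITC keeps only its diagonal
    fitc_var = np.clip(np.diag(Kff - Qff), 0.0, np.inf)
    print(fitc_var)  # near 0 at points close to the inducing points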

def _get_given_vals(self, given):
if given is None:
given = {}
if 'gp' in given:
cov_total = given['gp'].cov_func
mean_total = given['gp'].mean_func
else:
cov_total = self.cov_func
mean_total = self.mean_func
if all(val in given for val in ['X', 'f', 'Xu']):
X, Xu, f = given['X'], given['Xu'], given['f']
else:
X, Xu, f = self.X, self.Xu, self.f
return X, Xu, f, cov_total, mean_total

def _build_conditional(self, Xnew, X, Xu, f, cov_total, mean_total):
Kuu = cov_total(Xu)
Luu = cholesky(stabilize(Kuu))

Kuf = cov_total(Xu, X)
if self.approx == 'FITC':
Kff_diag = self.cov_func(X, diag=True)
Qff_diag = project_inverse(Kuf.T, Luu, diag=True)
Lambda = tt.clip(Kff_diag - Qff_diag, 0.0, np.inf)
Kfu = (Kuf / Lambda).T
elif self.approx == 'DTC':
Kfu = Kuf.T
Kuffu = tt.dot(Kuf, Kfu)
Luffu = cholesky(stabilize(Kuffu))
Ksu = self.cov_func(Xnew, Xu)
r = f - mean_total(X) # the equations are derived for 0-mean f
if self.approx == 'FITC':
r /= Lambda
# reconstruct K_{uu}^{-1} u given f
Kuuiu = invert_dot(Luffu, tt.dot(Kuf, r))
# the exact conditional mean f_* | u
mus = self.mean_func(Xnew) + tt.dot(Ksu, Kuuiu)
Qss = project_inverse(Ksu, Luu)
Kss = self.cov_func(Xnew)
cov = Kss - Qss # the exact test conditional covariance
return mus, cov
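
A hedged NumPy sketch of the reconstruction step above for the DTC case (random matrices stand in for the kernel blocks): given `f = Kfu @ alpha` with `alpha = Kuu^{-1} u`, the normal equations recover `alpha`:

    import numpy as np

    rng = np.random.default_rng(1)
    n, m = 10, 5
    Kfu = rng.standard_normal((n, m))  # stand-in for K(X, Xu)
    alpha = rng.standard_normal(m)     # stand-in for K(Xu, Xu)^{-1} u
    f = Kfu @ alpha                    # DTC prior with a zero mean function

    Kuf = Kfu.T
    Kuffu = Kuf @ Kfu  # m x m, invertible when Kfu has full column rank
    alpha_rec = np.linalg.solve(Kuffu, Kuf @ f)
    print(np.allclose(alpha_rec, alpha))  # True: alpha is recovered exactly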

def conditional(self, name, Xnew, given=None, **kwargs):
R"""
Returns the approximate conditional distribution evaluated
over new input locations `Xnew`.

The DTC and FITC approximations
both use the exact test conditional `p(f_* | u)` as an approximation
to `p(f_* | f)`, where `u` is reconstructed from the given `f`.

Note that the conditional distribution
is not as sparse as the prior, because it evaluates
the full covariance matrix for `Xnew`.

Parameters
----------
name : string
Name of the random variable
Xnew : array-like
Function input values.
given : dict
Can optionally take as key value pairs: `X`, `f`, `Xu`,
and `gp`. See the section in the documentation on additive GP
models in PyMC3 for more information.
**kwargs
Extra keyword arguments that are passed to `MvNormal` distribution
constructor.
"""
givens = self._get_given_vals(given)
mu, cov = self._build_conditional(Xnew, *givens)
chol = cholesky(stabilize(cov))
shape = infer_shape(Xnew, kwargs.pop("shape", None))
return pm.MvNormal(name, mu=mu, chol=chol, shape=shape, **kwargs)


@conditioned_vars(["X", "f", "nu"])
class TP(Latent):
"""
24 changes: 22 additions & 2 deletions pymc3/gp/util.py
@@ -9,6 +9,24 @@
solve_upper = tt.slinalg.Solve(A_structure='upper_triangular')
solve = tt.slinalg.Solve(A_structure='general')

[Review comment by a contributor]: These are great! For consistency though, it would be good to use these functions throughout the gp source code (I know, a pain) or just write the theano ops as before.

def invert_dot(L, X):
    """Wrapper for common pattern K^{-1} @ X where K = L @ L^T"""
    return solve_upper(L.T, solve_lower(L, X))
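
A small NumPy check (illustrative) of the identity `invert_dot` relies on: with K = L @ L^T, two triangular solves equal one full solve against K:

    import numpy as np
    from scipy.linalg import solve_triangular

    rng = np.random.default_rng(2)
    B = rng.standard_normal((4, 4))
    K = B @ B.T + 4 * np.eye(4)  # symmetric positive definite
    L = np.linalg.cholesky(K)
    x = rng.standard_normal(4)

    # K^{-1} x via two triangular solves, as in invert_dot
    y = solve_triangular(L.T, solve_triangular(L, x, lower=True), lower=False)
    print(np.allclose(y, np.linalg.solve(K, x)))  # True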

def project_inverse(P, L, diag=True, P_T=None):
    """Wrapper for common pattern P @ K^{-1} @ P^T where K = L @ L^T.

    If `diag` is True, only the diagonal of the projected matrix is returned.
    `P_T` can be supplied if the transpose of `P` is already available.
    """
    same_P = P_T is None
    if same_P:
        P_T = P.T
    A = solve_lower(L, P_T)  # A = L^{-1} @ P^T
    if diag:
        return tt.sum(A * A, axis=0)  # the diagonal of A.T @ A
    else:
        if same_P:
            return tt.dot(A.T, A)
        else:
            return tt.dot(P, invert_dot(L, P_T))
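
And for `project_inverse`: the `diag=True` branch computes diag(P K^{-1} P^T) without forming the full product. A NumPy sketch under the same assumptions as above:

    import numpy as np
    from scipy.linalg import solve_triangular

    rng = np.random.default_rng(3)
    P = rng.standard_normal((6, 4))
    B = rng.standard_normal((4, 4))
    K = B @ B.T + 4 * np.eye(4)
    L = np.linalg.cholesky(K)

    A = solve_triangular(L, P.T, lower=True)  # A = L^{-1} P^T
    diag_fast = (A * A).sum(axis=0)           # diag(A^T A) = diag(P K^{-1} P^T)
    diag_full = np.diag(P @ np.linalg.solve(K, P.T))
    print(np.allclose(diag_fast, diag_full))  # True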


def infer_shape(X, n_points=None):
if n_points is None:
@@ -71,7 +89,7 @@ def setter(self, val):
return gp_wrapper


-def plot_gp_dist(ax, samples, x, plot_samples=True, palette="Reds"):
+def plot_gp_dist(ax, samples, x, plot_samples=True, palette="Reds", label=None):

[Review comment by a contributor]: nice

""" A helper function for plotting 1D GP posteriors from trace """
import matplotlib.pyplot as plt

@@ -80,11 +98,13 @@ def plot_gp_dist(ax, samples, x, plot_samples=True, palette="Reds"):
colors = (percs - np.min(percs)) / (np.max(percs) - np.min(percs))
samples = samples.T
x = x.flatten()
+    i_last = len(percs) - 1
for i, p in enumerate(percs[::-1]):
upper = np.percentile(samples, p, axis=1)
lower = np.percentile(samples, 100-p, axis=1)
color_val = colors[i]
-        ax.fill_between(x, upper, lower, color=cmap(color_val), alpha=0.8)
+        lab = label if i == i_last else None
+        ax.fill_between(x, upper, lower, color=cmap(color_val), alpha=0.8, label=lab)
if plot_samples:
# plot a few samples
idx = np.random.randint(0, samples.shape[1], 30)
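
A hedged usage sketch for the new `label` argument (`trace["f"]` and `X` are assumed to come from sampling a model containing `gp.prior("f", X=X, Xu=Xu)`); only one fill is labeled, so the legend gets a single entry for all percentile bands:

    import matplotlib.pyplot as plt
    from pymc3.gp.util import plot_gp_dist

    fig, ax = plt.subplots()
    plot_gp_dist(ax, trace["f"], X, label="posterior of f")
    ax.legend()
    plt.show()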