initial working draft of kameleon mcmc #599
@@ -0,0 +1,267 @@
'''
Created on Aug 29, 2014

@author: Heiko Strathmann
'''

from scipy.spatial.distance import squareform, pdist

from theano import function
import theano
from theano.configparser import TheanoConfigParser

import numpy as np
from pymc.model import modelcontext
from pymc.step_methods.arraystep import ArrayStep, metrop_select
from pymc.step_methods.metropolis import tune
import scipy as sp
import theano.tensor as T

from ..core import *


# To avoid Theano complaining about missing test values
TheanoConfigParser().compute_test_value = 'off'


__all__ = ['KameleonOracle']

class KameleonOracle(ArrayStep):
    """
    Kernel Adaptive Metropolis-Hastings sampling step with a fixed set of oracle
    samples. Automatic tuning of the scaling is possible via a simple schedule
    for a given number of iterations.

    Based on "Kernel Adaptive Metropolis Hastings" by D. Sejdinovic, H. Strathmann,
    M. Lomeli, C. Andrieu, A. Gretton
    http://jmlr.org/proceedings/papers/v32/sejdinovic14.html

    See also https://github.com/karlnapf/kameleon-mcmc for experimental code.

    Parameters
    ----------
    vars : list
        List of variables for sampler
    Z : 2d numpy array
        Oracle sample used to represent the target covariance structure
    gamma2 : scalar
        Exploration term in the proposal
    nu2 : scalar
        Scaling of the covariance part of the proposal
    kernel : Kernel instance
        Kernel used to represent the covariance structure in feature space.
        Must implement the kernel and gradient interface, see for example GaussianKernel
    tune : bool
        Whether to tune the scaling nu2 every tune_interval iterations
    tune_interval : int
        Number of iterations between tuning updates
    model : PyMC Model
        Optional model for sampling step. Defaults to None (taken from context).
    """
    def __init__(self, vars=None, Z=None, gamma2=0.1, nu2=1., kernel=None,
                 tune=True, tune_interval=100, model=None):
        model = modelcontext(model)
        if vars is None:
            vars = model.vars

        self.Z = Z
        self.kernel = kernel
        self.gamma2 = gamma2
        self.nu2 = nu2

        # empty proposal distribution and last target log-pdf
        self.q_dist = None
        self.log_pdf_target = -np.inf

        # statistics for tuning the scaling
        self.tune = tune
        self.tune_interval = tune_interval
        self.steps_until_tune = tune_interval
        self.accepted = 0

        super(KameleonOracle, self).__init__(vars, [model.fastlogp])

    def astep(self, q0, logp):
        # sample from the kernel-based Gaussian proposal
        q_dist = self.construct_proposal(q0)
        q = np.ravel(q_dist.sample())

        # evaluate target log probability
        logp_q = logp(q)

        # MH accept/reject step
        if self.q_dist is None:
            q_new = q
        else:
            q_new = metrop_select(logp_q + q_dist.log_pdf(q0)
                                  - self.log_pdf_target - self.q_dist.log_pdf(q), q, q0)

        # adapt
        if self.tune and not self.steps_until_tune:
            # tune scaling parameter using the Metropolis tuning schedule
            self.nu2 = tune(self.nu2, self.accepted / float(self.tune_interval))
            # reset counter
            self.steps_until_tune = self.tune_interval
            self.accepted = 0

        # update log-pdf and proposal distribution object on accept
        if any(q_new != q0):
            self.accepted += 1
            self.q_dist = q_dist
            self.log_pdf_target = logp_q

        self.steps_until_tune -= 1

        return q_new

    def compute_constants(self, y):
        r"""
        Pre-computes constants of the log density of the proposal distribution,
        which is Gaussian as p(x|y) ~ N(mu, R)
        where
            mu = y
            R  = gamma^2 I + nu^2 M H M^T
            H  = I - 1/n 1 1^T is the centering matrix
            M  = 2 [\nabla_x k(x, z_1)|_{x=y}, ..., \nabla_x k(x, z_n)|_{x=y}]

        Returns (mu, L_R), where L_R is the lower Cholesky factor of R
        """
        assert len(np.shape(y)) == 1

        # M = 2 [\nabla_x k(x, z_i)]|_{x=y}, stored with one gradient per row
        R = self.gamma2 * np.eye(len(y))
        if self.Z is not None:
            M = 2 * self.kernel.gradient(y, self.Z)
            # R = gamma^2 I + nu^2 M^T H M, with H the centering matrix and
            # the gradients stored as rows of M
            H = np.eye(len(self.Z)) - 1.0 / len(self.Z)
            R += self.nu2 * M.T.dot(H.dot(M))

        L_R = np.linalg.cholesky(R)

        return y.copy(), L_R

    def construct_proposal(self, y):
        """
        Constructs the Kameleon MCMC proposal centred at y, using history Z

        The proposal is a Gaussian based on the kernel values between y and all
        points in the chain history.
        """
        mu, L = self.compute_constants(y)

        return Gaussian(mu, L, is_cholesky=True)

class Gaussian():

Review comment: A better approach would be to upgrade our distributions to be able to handle these sorts of things, rather than have distribution classes in here. I had started working on random number generators, but have not finished the work.

Review comment: Ok, I will add this to the multivariate normal then. The point is that it is possible to represent a Gaussian in terms of a Cholesky factor, to avoid wasteful re-computation of the linear solves. Unfortunately, Theano does not yet support that; I sent them a patch, however, see Theano/Theano#2059

Review comment: Ok, so doing this in terms of … But where to put the sampling? Should I just, for now, copy-paste the code directly into the sampler? It might hurt later, though. What are your thoughts?

    """
    Helper class to sample from and evaluate the log-pdf of a multivariate Gaussian,
    using an efficient Cholesky-based representation (the Cholesky factor is computed only once)
    """
    def __init__(self, mu, Sigma, is_cholesky=False):
        self.mu = mu
        self.is_cholesky = is_cholesky

        if self.is_cholesky:
            self.L = Sigma
        else:
            self.L = np.linalg.cholesky(Sigma)

        self.dimension = len(mu)

    def log_pdf(self, X):
        # duck typing for shape: accept a single point or a 2d array of row vectors
        if len(np.shape(X)) == 1:
            X = X.reshape(1, len(X))

        log_determinant_part = -np.sum(np.log(np.diag(self.L)))

        quadratic_parts = np.zeros(len(X))
        for i in range(len(X)):
            x = X[i] - self.mu

            # solve y = Sigma^(-1) x = L^(-T) L^(-1) x via two triangular solves
            y = sp.linalg.solve_triangular(self.L, x.T, lower=True)
            y = sp.linalg.solve_triangular(self.L.T, y, lower=False)
            quadratic_parts[i] = -0.5 * x.dot(y)

        const_part = -0.5 * len(self.L) * np.log(2 * np.pi)

        return const_part + log_determinant_part + quadratic_parts

    def sample(self, n=1):
        V = np.random.randn(self.dimension, n)

        # map standard normal draws to the desired Gaussian and transpose to row-wise vectors
        return self.L.dot(V).T + self.mu
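
A quick way to sanity-check the Cholesky-based log-pdf above (an editor's sketch, not part of the diff; it assumes scipy.stats.multivariate_normal is available) is to compare it against scipy on a random covariance matrix:

import numpy as np
from scipy.stats import multivariate_normal

A = np.random.randn(3, 3)
Sigma = A.dot(A.T) + 3 * np.eye(3)  # random symmetric positive definite covariance
mu = np.random.randn(3)

g = Gaussian(mu, Sigma, is_cholesky=False)
X = g.sample(n=5)

# both evaluations should agree up to floating point error
np.testing.assert_allclose(g.log_pdf(X), multivariate_normal(mu, Sigma).logpdf(X))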

def theano_sq_dists_mat_expr(X, Y):
    # expanded squared Euclidean distances between all rows of X and all rows of Y
    return (-2 * X.dot(Y.T).T + T.sum(X ** 2, 1).T).T + T.sum(Y ** 2, 1)

def theano_gaussian_kernel_expr(sq_dists, sigma):
    return T.exp(-sq_dists / (2. * sigma ** 2))

def theano_sq_dists_vec_expr(x, Y):
    # squared norms of the rows of Y
    Y_norm = T.sum(Y ** 2, 1)
    xY_terms = x.T.dot(Y.T)

    # expanded squared Euclidean distance
    return T.sum(x ** 2) + Y_norm - 2 * xY_terms

class GaussianKernel():
    """
    Helper class to represent a Gaussian kernel, with methods to compute the kernel
    function and its gradient wrt the left argument.
    Uses Theano's autodiff for computing kernel gradients.
    """

    # compile theano functions once at class level
    X = T.dmatrix('X')
    x = T.dvector('x')
    Y = T.dmatrix('Y')
    sigma = T.dscalar('sigma')

    # kernel expressions for the left input being a matrix or a vector
    sq_dist_mat_expr = theano_sq_dists_mat_expr(X, Y)
    sq_dist_vec_expr = theano_sq_dists_vec_expr(x, Y)
    K_expr = theano_gaussian_kernel_expr(sq_dist_mat_expr, sigma)
    k_expr = theano_gaussian_kernel_expr(sq_dist_vec_expr, sigma)

    # compile
    theano_kernel_mat = function(inputs=[X, Y, sigma], outputs=K_expr)
    theano_kernel_vec = function(inputs=[x, Y, sigma], outputs=k_expr)
    theano_kernel_vec_grad_x = function(inputs=[x, Y, sigma],
                                        outputs=theano.gradient.jacobian(k_expr, x))

    @staticmethod
    def gaussian_median_heuristic(X):
        # set the kernel width from the median of the positive pairwise squared distances
        dists = squareform(pdist(X, 'sqeuclidean'))
        median_dist = np.median(dists[dists > 0])
        sigma = np.sqrt(0.5 * median_dist)
        return sigma
|
||
def __init__(self, width): | ||
self.width = width | ||
|
||
def kernel(self, X, Y=None): | ||
""" | ||
Computes the standard Gaussian kernel k(x,y)=exp(-0.5* ||x-y||**2 / sigma**2) | ||
|
||
X - 2d array, samples on right hand side | ||
Y - 2d array, samples on left hand side, can be None in which case its replaced by X | ||
""" | ||
return self.theano_kernel(X, Y, self.width) | ||
|
||
def gradient(self, x, Y): | ||
""" | ||
Computes the gradient of the Gaussian kernel wrt. to the left argument, i.e. | ||
k(x,y)=exp(-0.5* ||x-y||**2 / sigma**2), which is | ||
\nabla_x k(x,y)=1.0/sigma**2 k(x,y)(y-x) | ||
Given a set of row vectors Y, this computes the | ||
gradient for every pair (x,y) for y in Y. | ||
|
||
x - single sample on right hand side (1d array) | ||
Y - samples on left hand side (2d array) | ||
""" | ||
return self.theano_kernel_vec_grad_x(x, Y, self.width) |
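
The analytic form in the gradient docstring can be checked directly against the compiled Theano Jacobian (an editor's sketch, not part of the diff; the dimensions and kernel width are arbitrary):

import numpy as np

kernel = GaussianKernel(width=1.5)
x = np.random.randn(4)
Y = np.random.randn(10, 4)

# analytic gradient: \nabla_x k(x,y) = k(x,y) (y - x) / sigma**2
k_vals = kernel.kernel(x.reshape(1, -1), Y)[0]
analytic = (k_vals[:, None] * (Y - x)) / kernel.width ** 2

np.testing.assert_allclose(kernel.gradient(x, Y), analytic)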
Review comment: I am not sure whether you allow this, but compiling my expressions fails otherwise.
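
For context, a minimal usage sketch (an editor's illustration only; the toy model and the exact MvNormal/sample signatures of this pymc development version are assumptions) might look like:

import numpy as np
import pymc as pm

# oracle samples that roughly cover the target, e.g. from an earlier exploratory run
Z = np.random.randn(500, 2)

# kernel width chosen by the median heuristic on the oracle samples
kernel = GaussianKernel(width=GaussianKernel.gaussian_median_heuristic(Z))

with pm.Model():
    x = pm.MvNormal('x', mu=np.zeros(2), tau=np.eye(2), shape=2)
    step = KameleonOracle(Z=Z, kernel=kernel, gamma2=0.1, nu2=1.)
    trace = pm.sample(2000, step=step)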