diff --git a/pymc/distributions/continuous.py b/pymc/distributions/continuous.py index 1c7c4c2537..55b6fec986 100644 --- a/pymc/distributions/continuous.py +++ b/pymc/distributions/continuous.py @@ -8,6 +8,7 @@ from __future__ import division from .dist_math import * +import numpy from numpy.random import uniform as runiform, normal as rnormal __all__ = ['Uniform', 'Flat', 'Normal', 'Beta', 'Exponential', 'Laplace', @@ -54,7 +55,7 @@ def logp(self, value): upper = self.upper return bound( - -log(upper - lower), + -log(upper - lower), # Inserted cast to prevent exception in theano.ifelse.ifelse lower <= value, value <= upper) def random(self, size=None): diff --git a/pymc/distributions/discrete.py b/pymc/distributions/discrete.py index fa78baa11f..bdc517650f 100644 --- a/pymc/distributions/discrete.py +++ b/pymc/distributions/discrete.py @@ -285,7 +285,7 @@ def logp(self, value): p = self.p k = self.k - return bound(log(p[value]), + return bound_scalar(t.sum(log(p[value])), value >= 0, value <= (k - 1), le(abs(sum(p) - 1), 1e-5)) diff --git a/pymc/distributions/dist_math.py b/pymc/distributions/dist_math.py index 08080e5572..f77ab899ed 100644 --- a/pymc/distributions/dist_math.py +++ b/pymc/distributions/dist_math.py @@ -13,31 +13,68 @@ zeros_like, ones, ones_like, concatenate, constant, argmax) +import theano -from numpy import pi, inf, nan -from .special import gammaln, multigammaln +from numpy import pi, inf, nan, float64, array +from .special import gammaln, multigammaln +from theano.ifelse import ifelse from theano.printing import Print from .distribution import * +def guess_is_scalar(c): + bcast = None + if (isinstance(c, t.TensorType)): + return c.broadcastable == () + + if (isinstance(c, t.Variable) or isinstance(c, t.Constant)): + return c.type.broadcastable == () + + tt = t.as_tensor_variable(c) + return tt.type.broadcastable == () -def bound(logp, *conditions): +impossible = constant(-inf, dtype=float64) # We re-use that constant + +def bound_scalar(logp, *conditions): """ Bounds a log probability density with several conditions Parameters ---------- logp : float - *conditionss : booleans + *conditions : booleans Returns ------- logp if all conditions are true -inf if some are false + """ + cond = alltrue(conditions) + return ifelse(cond, t.cast(logp, dtype='float64'), impossible) + + +def bound(logp, *conditions): """ + Bounds a log probability density with several conditions - return switch(alltrue(conditions), logp, -inf) + Parameters + ---------- + logp : float + *conditionss : booleans + Returns + ------- + logp if all conditions are true + -inf if some are false + """ + if (guess_is_scalar(logp)): + for c in conditions: + if (type(c)==bool): + continue + if (not guess_is_scalar(c)): + break + return bound_scalar(logp, *conditions) + return switch(alltrue(conditions), logp, impossible) def alltrue(vals): ret = 1 @@ -52,6 +89,11 @@ def logpow(x, m): """ return switch(eq(x, 0) & eq(m, 0), 0, m * log(x)) +def logpow_relaxed(x, m): + """ + Calculates log(x**m), unless m==0 or x==0 in which case it will return 0 + """ + return switch(eq(x, 0) | eq(m, 0), 0, m * log(x)) def factln(n): return gammaln(n + 1) diff --git a/pymc/distributions/mixtures.py b/pymc/distributions/mixtures.py new file mode 100644 index 0000000000..a8303607b0 --- /dev/null +++ b/pymc/distributions/mixtures.py @@ -0,0 +1,117 @@ + + +from pymc import * + +from numpy.random import normal +import numpy as np +import pylab as pl +from itertools import product +import theano +import theano.tensor as T +from 
theano.ifelse import ifelse +from numpy import pi +from numpy.random import uniform, multivariate_normal +from pymc.model import FreeRV, ObservedRV +import numpy as np +import numpy.linalg as np_linalg +from pymc.step_methods.metropolis_hastings import GenericProposal + +""" +Multivariate Mixture distributions. + +These distributions have a lot of potential uses. For one thing: They can be used to approximate every other density function, +and there are a lot of algorithms for efficiently creating density estimates (i.e. Kernel Density Estimates (KDE), +Gaussian Mixture Model learning and others. + +Once we have learned such a density, we could use it as a prior density. To do that, we need to be able to +efficiently sample from it. Luckily, that's simple and efficient. Unless you use the wrong proposal distribution, of course. + +That's where these classes come in handy. + +@author Kai Londenberg ( Kai.Londenberg@gmail.com ) + +""" +class MvGaussianMixture(Continuous): + + def __init__(self, shape, cat_var, mus, taus, model=None, *args, **kwargs): + ''' + Creates a continuous mixture distribution which can be efficiently evaluated and sampled from. + + Args: + shape: Shape of the distribution. All kernels need to have this same shape as well. + weights: weight vector (may be a theano shared variable or expression. Must be able to evaluate this in the context of the model). + kernels: list of mixture component distributions. These have to be MixtureKernel instances (for example MvNormalKernel ) of same shape + testval: Test value for logp computations + ''' + super(MvGaussianMixture, self).__init__(*args, shape=shape, **kwargs) + assert isinstance(cat_var, FreeRV) + assert isinstance(cat_var.distribution, Categorical) + self.cat_var = cat_var + self.model = modelcontext(model) + weights = cat_var.distribution.p + self.weights = weights + self.mus = mus + self.taus = taus + self.mu_t = T.stacklists(mus) + self.tau_t = T.stacklists(taus) + self.shape = shape + self.testval = np.zeros(self.shape, self.dtype) + self.last_cov_value = {} + self.last_tau_value = {} + self.param_fn = None + + def logp(self, value): + mu = self.mu_t[self.cat_var] + tau = self.tau_t[self.cat_var] + + delta = value - mu + k = tau.shape[0] + + return 1/2. 
* (-k * log(2*pi) + log(det(tau)) - dot(delta.T, dot(tau, delta))) + + def draw_mvn(self, mu, cov): + + return np.random.multivariate_normal(mean=mu, cov=cov) + + def draw(self, point): + if (self.param_fn is None): + self.param_fn = model.fastfn([self.cat_var, self.mu_t[self.cat_var], self.tau_t[self.cat_var]]) + + cat, mu, tau = self.param_fn(point) + # Cache cov = inv(tau) for each value of cat + cat = int(cat) + last_tau = self.last_tau_value.get(cat, None) + if (last_tau is not None and np.allclose(tau,last_tau)): + mcov = self.last_cov_value[cat] + else: + mcov = np_linalg.inv(tau) + self.last_cov_value[cat] = mcov + self.last_tau_value[cat] = tau + + return self.draw_mvn(mu, mcov) + +_impossible = float('-inf') +class GMMProposal(GenericProposal): + + def __init__(self, gmm_var, model=None): + super(GMMProposal, self).__init__([gmm_var]) + assert isinstance(gmm_var.distribution, MvGaussianMixture) + model = modelcontext(model) + self.gmm_var = gmm_var + self.logpfunc = model.fastfn(gmm_var.distribution.logp(gmm_var)) + self._proposal_logp_difference = 0.0 + + + def propose_move(self, from_point, point): + old_logp = self.logpfunc(from_point) + point[self.gmm_var.name] = self.gmm_var.distribution.draw(point) + new_logp = self.logpfunc(point) + if (old_logp!=np.nan and old_logp!=_impossible): + self._proposal_logp_difference = old_logp-new_logp + else: + self._proposal_logp_difference = 0.0 + return point + + def proposal_logp_difference(self): + return self._proposal_logp_difference + diff --git a/pymc/distributions/multivariate.py b/pymc/distributions/multivariate.py index 945e39265c..76eda5d61f 100644 --- a/pymc/distributions/multivariate.py +++ b/pymc/distributions/multivariate.py @@ -1,7 +1,8 @@ from .dist_math import * from theano.sandbox.linalg import det, solve, matrix_inverse, trace -from theano.tensor import dot, cast +from theano.tensor import dot, cast, gt +from theano.ifelse import ifelse from theano.printing import Print __all__ = ['MvNormal', 'Dirichlet', 'Multinomial', 'Wishart'] @@ -134,46 +135,116 @@ class Wishart(Continuous): distribution of the maximum-likelihood estimator (MLE) of the precision matrix of a multivariate normal distribution. If V=1, the distribution is identical to the chi-square distribution with n degrees of freedom. + + It is also the conjugate Prior for the Precision Matrix parameter of a + multivariate normal. + + This follows the parameterization given in formula 290 and 291 on page 24 + of [Kevin Murphy, Conjugate Bayesian Analysis of the Gaussian Distribution] + available at http://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf + see that paper for further details and references. + + To create noninformative priors for covariance or precision matrices, see + Huang and Wang, "Simple Marginally Noninformative Prior Distributions + for Covariance Matrices" ( http://ba.stat.cmu.edu/journal/2013/vol08/issue02/huang.pdf ) + and Gelman, "Prior Distributions for variance parameters in hierarchical models" + ( https://faculty.washington.edu/jmiyamot/bayes/gelmana%20prior%20distributions%20f%20variance%20parameters%20i%20hierarchical%20mods.pdf ) + as well as http://www.themattsimpson.com/2012/08/20/prior-distributions-for-covariance-matrices-the-scaled-inverse-wishart-prior/ + and http://dahtah.wordpress.com/2012/03/07/why-an-inverse-wishart-prior-may-not-be-such-a-good-idea/ + for a discussion of possible problems with priors of covariance and / or precision matrices + + :Parameters: + v : int + Degrees of freedom, v > p-1 . 
+ S : ndarray + p x p positive definite matrix (scale matrix / scatter matrix). Can be interpreted as the scatter matrix of v p-variate observations. - For an alternative parameterization based on :math:`C=T{-1}` (Not yet implemented) + :Support: + X : matrix + Symmetric, positive definite. + """ + def __init__(self, v, S, *args, **kwargs): + super(Wishart, self).__init__(*args, **kwargs) + self.v = v + self.p = p = S.shape[0] + self.S = S + + self.mean = v * S + self.mode = switch(1*(v > p + 1), + (v - p - 1) * S, + nan) + 'TODO: We should pre-compute the following if the parameters are fixed' + self.invalid = theano.tensor.fill(S, nan) # Invalid result, if v
<=
0. - V : ndarray - p x p positive definite matrix + v : int + Degrees of freedom, v > p - 1 + S : ndarray + p x p positive definite matrix (scale matrix / scatter matrix). Can be interpreted as the scatter matrix of v p-variate observations. :Support: X : matrix Symmetric, positive definite. """ - def __init__(self, n, V, *args, **kwargs): + def __init__(self, v, S, *args, **kwargs): super(Wishart, self).__init__(*args, **kwargs) - self.n = n - self.p = p = V.shape[0] - self.V = V - self.mean = n * V - self.mode = switch(1*(n >= p + 1), - (n - p - 1) * V, - nan) + self.v = v + self.S = S + self.p = p = S.shape[0] + self.inv_S = matrix_inverse(S) + + 'TODO: We should pre-compute the following if the parameters are fixed' + self.invalid = theano.tensor.fill(S, nan) # Invalid result, if v
<=
(p - 1)) diff --git a/pymc/distributions/special_priors.py b/pymc/distributions/special_priors.py new file mode 100644 index 0000000000..9fcf902b2b --- /dev/null +++ b/pymc/distributions/special_priors.py @@ -0,0 +1,40 @@ + +from .multivariate import InverseWishart +from .continuous import InverseGamma +from sys import float_info +import numpy as np +from pymc.distributions.continuous import InverseGamma +from theano.sandbox.linalg.ops import diag, psd + +def huang_wang_covariance_prior(name1, name2, d, observed_scatter_matrix=None, observed_sample_size=0, model=None): + ''' + Construct a noninformative or informative prior for a mv normal covariance matrix + following Huang and Wang, "Simple Marginally Noninformative Prior Distributions + for Covariance Matrices" ( http://ba.stat.cmu.edu/journal/2013/vol08/issue02/huang.pdf ) + + If no observations are included, the resulting prior has an almost flat half-t distribution + over the variance of the variables, while efficiently having an + uniform-prior (range [-1,1] for the correlation coefficients. + + This prior does not introduce a dependency between variance and correlation strength, as happens with + a simple InverseWishart prior. + + Arguments: + name1: Name for the Inverse Wishart distribution which will be created + name2: Name for the Inverse Gamma distribution which will be created as a prior for the diagonal elements of the inv_S param of the Inverse Wishart + d: Dimensionality, i.e. number of variables to create a joint covariance prior for. + observed_scatter_matrix: d x d dimensional scatter matrix of (possibly virtual) observations to combine the noninformative prior with, to form an informative prior (or posterior). + model: (optional) the model + Returns: + A tuple consisting of the covariance prior (an InverseWishart), and the hyperprior (InverseGamma) for the + diagonal elements of the InverseWishart inv_S parameter + ''' + + A = float_info.max / 4. # Large number + d_ones = np.ones(d, dtype=np.float64) + a_hyperprior = InverseGamma(name=name2, d_ones/2., d_ones / A, model=model) + S = diag(4. 
/ a_hyperprior) + if (observed_scatter_matrix is not None): + S = S + observed_scatter_matrix + cov_prior = InverseWishart(name=name1, d+1+observed_sample_size, S, model=model) + return cov_prior, a_hyperprior \ No newline at end of file diff --git a/pymc/distributions/transforms.py b/pymc/distributions/transforms.py index 6a24589459..a1ffe9a3d5 100644 --- a/pymc/distributions/transforms.py +++ b/pymc/distributions/transforms.py @@ -34,8 +34,14 @@ def __init__(self, dist, transform, *args, **kwargs): transform : Transform args, kwargs arguments to Distribution""" - forward = transform.forward + forward = transform.forward + defaults = dist.defaults try: + # If we don't do this, we will have problems + # with invalid mode values of transformed Dirichlet variables where one of the hyperparams is exactly 1 + self.default_val = forward(dist.default()) + defaults = ['default_val'] + defaults + testval = forward(dist.testval) except TypeError: testval = dist.testval @@ -52,7 +58,7 @@ def __init__(self, dist, transform, *args, **kwargs): super(TransformedDistribution, self).__init__(v.shape.tag.test_value, v.dtype, - testval, dist.defaults, + testval, defaults, *args, **kwargs) diff --git a/pymc/examples/gmm_mixture_sampling.py b/pymc/examples/gmm_mixture_sampling.py new file mode 100644 index 0000000000..2f49aa5e30 --- /dev/null +++ b/pymc/examples/gmm_mixture_sampling.py @@ -0,0 +1,135 @@ +import numpy as np +from pymc import * + +from pymc.step_methods.metropolis_hastings import * +from pymc.distributions.mixtures import * +from pymc.distributions.special import gammaln +from pymc.distributions.dist_math import * + +import theano +import theano.tensor as T +from theano.ifelse import ifelse as theano_ifelse +import pandas +import matplotlib.pylab as plt +import numpy.linalg + +theano.config.mode = 'FAST_COMPILE' +theano.config.compute_test_value = 'raise' + +''' +Example which demonstrates the use of the MvGaussianMixture as a flexible (i.e. arbitrary) Prior +and the usage of the MetropolisHastings step method with a custom Proposal +''' + +# Some helper methods +def is_pos_def(x): + return np.all(np.linalg.eigvals(x) > 0) + +def tmin(a,b): + return T.switch(T.lt(a,b), a, b) + +def tmax(a,b): + return T.switch(T.gt(a,b), a, b) + +''' We construct a number of valid +parameters for the multivariate normal. + +In order to ensure that the covariance matrices are positive semidefinite, they are risen to the (matrix-) power of two. +Precision matrices are obtained by inverting the covariance matrices. 
+''' +def create_model(with_observations=False): + + # First, we create some parameters for a Multivariate Normal + mu0 = np.ones((2), dtype=np.float64) + cov0 = np.eye(2, dtype=np.float64) * 0.2 + tau0 = numpy.linalg.inv(cov0) + + mu1 = mu0+5.5 + cov1 = cov0.copy() * 1.4 + cov1[0,1] = 0.8 + cov1[1,0] = 0.8 + cov1 = np.linalg.matrix_power(cov1,2) + tau1 = numpy.linalg.inv(cov1) + + mu2 = mu0-2.5 + cov2 = cov0.copy() * 1.4 + cov2[0,1] = 0.8 + cov2[1,0] = 0.8 + cov2 = np.linalg.matrix_power(cov2,2) + tau2 = numpy.linalg.inv(cov2) + + mu3 = mu0*-0.5 + 2.0*mu2 + cov3 = cov0.copy() * 0.9 + cov3[0,1] = -0.8 + cov3[1,0] = -0.8 + cov3 = np.linalg.matrix_power(cov3,2) + tau3 = numpy.linalg.inv(cov3) + + m2 = np.array([3.0]) + aval = np.array([4, 3, 2., 1.]) + a = constant(aval) + + model = Model() + + with model: + k = 4 + p, p_m1 = model.TransformedVar( + 'p', Dirichlet.dist(a, shape=k), + simplextransform) + + c = Categorical('c', p) + gmm = MvGaussianMixture('gmm', shape=mu0.shape, cat_var=c, mus=[mu0,mu1,mu2,mu3], taus=[tau0, tau1,tau2,tau3]) + ' Now, we ensure these arbitrary values sum to one. What do we get ? A nice probability which we can, in turn, sample from (or observe)' + fac = T.constant( np.array([10.0], dtype=np.float64)) + gmm_p = Deterministic('gmm_p', (tmax(tmin(gmm[0], fac) / fac, T.constant(np.array([0.0], dtype=np.float64))))) + pbeta = Beta('pbeta', 1,1) + if (with_observations): + result = Bernoulli('result', p=gmm_p[0], observed=np.array([1,0,1], dtype=np.bool)) + else: + result = Bernoulli('result', p=gmm_p[0]) + ' Try this with a Metropolis instance, and watch it fail ..' + step = MetropolisHastings(vars=model.vars, proposals=[GMMProposal(gmm)]) + return model, step + +def plot_scatter_matrix(title, tr, fig=None): + if (fig is None): + fig = plt.Figure() + t6 = pandas.Series(tr['c']) + t8 = pandas.Series(tr['gmm'][:,0]) + t9 = pandas.Series(tr['gmm'][:,1]) + t10 = pandas.Series(tr['gmm_p'][:,0]) + t11 = pandas.Series(tr['pbeta']) + df = pandas.DataFrame({'cat' : t6, 'gmm_0' : t8, 'gmm_1' : t9, 'p' : t10, 'pbeta' : t11}) + pandas.scatter_matrix(df) + plt.title(title) + return fig + +def plot_p_prior_vs_posterior(prior_trace, posterior_trace, fig=None): + if (fig is None): + fig = plt.Figure() + pr = pandas.Series(prior_trace['gmm_p'][:,0]) + po = pandas.Series(posterior_trace['gmm_p'][:,0]) + plt.hist(pr, bins=40, range=(-0.1, 1.1), color='b', alpha=0.5, label='Prior') + plt.hist(po, bins=40, range=(-0.1, 1.1), color='g', alpha=0.5, label='Posterior') + plt.legend() + plt.show() + +def run(n=10000): + m1, step1 = create_model(False) + m2, step2 = create_model(True) + with m1: + trace1 = sample(n, step1) + with m2: + trace2 = sample(n, step2) + fig1 = plot_scatter_matrix('Prior Scatter Matrix', trace1) + plt.show() + fig2 = plot_scatter_matrix('Posterior Scatter Matrix', trace2) + plt.show() + plot_p_prior_vs_posterior(trace1, trace2) + plt.show() + +if __name__ == '__main__': + run() + print "Done" + + diff --git a/pymc/examples/mh_example.py b/pymc/examples/mh_example.py new file mode 100644 index 0000000000..f04676c2a6 --- /dev/null +++ b/pymc/examples/mh_example.py @@ -0,0 +1,43 @@ +import numpy as np +from pymc import * +from pymc.step_methods.metropolis_hastings import * +import theano +import theano.tensor as T +import pandas +import matplotlib.pylab as plt +#theano.config.mode = 'DebugMode' + +model = Model() +with model: + + k = 5 + a = constant(np.array([2, 6., 4, 2, 2])) + pa = a / T.sum(a) + + p, p_m1 = model.TransformedVar( + 'p', Dirichlet.dist(a, shape=k), + 
simplextransform) + + c = Categorical('c', p) + +def run(n=50000): + if n == "short": + n = 50 + with model: + ' Try this with a Metropolis instance, and watch it fail ..' + step = MetropolisHastings() + trace = sample(n, step) + return trace +if __name__ == '__main__': + tr = run() + t1 = pandas.Series(tr['p'][:,0]) + t2 = pandas.Series(tr['p'][:,1]) + t3 = pandas.Series(tr['p'][:,2]) + t4 = pandas.Series(tr['p'][:,3]) + t5 = pandas.Series(tr['p'][:,4]) + t6 = pandas.Series(tr['c']) + df = pandas.DataFrame({'a' : t1,'b' : t2, 'c' : t3, 'd' : t4, 'cat' : t6}) + pandas.scatter_matrix(df) + plt.show() + + diff --git a/pymc/step_methods/metropolis_hastings.py b/pymc/step_methods/metropolis_hastings.py new file mode 100644 index 0000000000..8623dc0b23 --- /dev/null +++ b/pymc/step_methods/metropolis_hastings.py @@ -0,0 +1,282 @@ +from numpy.linalg import cholesky + +from ..core import * +from ..model import FreeRV +from .quadpotential import quad_potential + +from .arraystep import * +from numpy.random import normal, standard_cauchy, standard_exponential, poisson, random, uniform +from numpy import round, exp, copy, where, log, isfinite +import numpy as np +from ..distributions.discrete import Categorical +import theano.tensor as T +import theano +from metropolis import * + +__all__ = ['MetropolisHastings', 'GenericProposal', 'ProposalAdapter'] + +''' +More general version of Metropolis, which allows to combine different proposal distributions for +different sets of variables and then do a single accept / reject. + +Can also handle asymmetric proposals, i.e. it's an implementation of the Metropolis Hastings +algorithm, not just the Metropolis algorithm. + +The main purpose of this class is to allow for tailored proposal distributions for specialized +applications, such as efficiently sampling from a mixture model. + +@author Kai Londenberg (Kai.Londenberg@gmail.com) +''' + +class GenericProposal(object): + + def __init__(self, vars): + ''' + Constructor for GenericProposal. To be called by subclass constructors. + Arguments: + vars: List of variables (FreeRV instances) this GenericProposal is responsible for. + ''' + for v in vars: + assert isinstance(v, FreeRV) + self.vars = vars + + ''' + Asymmetric proposal distribution. + Has to be able to calculate move propabilities + ''' + def proposal_logp_difference(self): + ''' Return log-probability difference of move from one point to another + for the last proposed move + + Returns 0.0 by default, which is true for all symmetric proposal distributions + ''' + return 0.0 + + def propose_move(self, from_point, point): + ''' + Propose a move: Return a new point proposal + Destructively writes to the given point, destroying the old values in it + Arguments: + old_point: State before proposals. Calculate previous logp with respect to this point + point: New proposal. Update it, and calculate new logp with respect to the updated version of this point. 
+ ''' + raise NotImplementedError("Called abstract method") + + def tune_stepsize(self, stepsize_factor=1.0): + ''' + Tune proposal distribution to increase / decrease the stepsize of the proposals by the given factor (if possible) + ''' + pass + + +class ProposalAdapter(GenericProposal): + + def __init__(self, vars, proposal_dist=NormalProposal, scale=None): + super(ProposalAdapter, self).__init__(vars) + self.ordering = ArrayOrdering(vars) + if scale is None: + scale = np.ones(sum(v.dsize for v in vars)) + self.proposal = proposal_dist(scale) + self.discrete = np.array([v.dtype in discrete_types for v in vars]) + self.stepsize_factor = 1.0 + + + def propose_move(self, from_point, point): + i = 0 + delta = np.atleast_1d( self.proposal() * self.stepsize_factor) + for var, slc, shp in self.ordering.vmap: + dvar = delta[slc] + if self.discrete[i]: + dvar = round(delta[slc], 0).astype(int) + point[var] = point[var] + np.reshape(dvar, shp) + i += 1 + return point + + def tune_stepsize(self, stepsize_factor=1.0): + self.stepsize_factor = stepsize_factor + +class CategoricalProposal(GenericProposal): + + def __init__(self, vars, model=None): + super(CategoricalProposal, self).__init__(vars) + model = modelcontext(model) + self.vars = vars + varidx = T.iscalar('varidx') + logpt_elems = [] + self.paramfuncs = [] + self._proposal_logp_difference = 0.0 + + for i, v in enumerate(vars): + assert isinstance(v.distribution, Categorical) + self.paramfuncs.append(model.fastfn(T.as_tensor_variable(v.distribution.p))) + logpt_elems.append(v.distribution.logp(v)) + + self.logpfunc = model.fastfn(T.add(*logpt_elems)) + + def propose_move(self, from_point, point): + move_logp = 0.0 + for i, v in enumerate(self.vars): + weights = self.paramfuncs[i](point) + oldvalue = point[v.name] + newvalue = np.random.choice(len(weights),1, p=weights)[0] + 'Move probability: Probability of moving from new state to old state divided by probability of moving from old state to new state' + if (oldvalue!=newvalue): + move_logp += log(weights[oldvalue]) - log(weights[newvalue]) + point[v.name] = newvalue + + self._proposal_logp_difference = move_logp + return point + + def proposal_logp_difference(self): + return self._proposal_logp_difference + + +class MetropolisHastings(object): + + """ + Metropolis-Hastings sampling step + + Parameters + ---------- + vars : list + List of variables for sampler. + proposals : array of GenericProposal instances. For variables not covered by these, but present in "vars" list above, the constructor will choose Proposal distributions automatically + scaling : scalar or array + Initial scale factor for proposal. Defaults to 1. + tune : bool + Flag for tuning. Defaults to True. + model : PyMC Model + Optional model for sampling step. Defaults to None (taken from context). + + """ + def __init__(self, vars=None, proposals=None, scaling=1., tune=True, tune_interval=100, model=None): + model = modelcontext(model) + self.model = model + if ((vars is None) and (proposals is None)): + vars = model.vars + + covered_vars = set() + if (proposals is not None): + for p in proposals: + for v in p.vars: + covered_vars.add(v) + if (vars is not None): + vset = set(vars) | set(covered_vars) + else: + vset = set(covered_vars) + self.vars = list(vset) + remaining_vars = vset-covered_vars + if (proposals is None): + proposals = [] + if (len(remaining_vars)>0): + cvars = [] + ovars = [] + for v in vars: # We don't iterate over remaining vars, because the ordering of the vars might be important somehow.. 
+ if (v in covered_vars): + continue + if (isinstance(v.distribution, Categorical)): + cvars.append(v) + else: + ovars.append(v) + # Make sure we sample the categorical ones first. Others might switch their regime accordingly .. + if (len(cvars)>0): + proposals = [ CategoricalProposal(cvars)] + proposals + if (len(ovars)>0): + proposals.append(ProposalAdapter(ovars, NormalProposal)) + self.proposals = proposals + vars = self.vars + for p in proposals: + p.tune_stepsize(scaling) + + self.scaling = scaling + self.tune = tune + self.tune_interval = tune_interval + self.steps_until_tune = tune_interval + self.accepted = 0 + self.fastlogp = self.model.fastlogp + self.oldlogp = None + self.acceptance_rate = [] + + def step(self, point): + self._ensure_tuned() + proposal = point.copy() + proposal_symmetry = 0.0 + for prop in self.proposals: + proposal = prop.propose_move(point, proposal) + proposal_symmetry += prop.proposal_logp_difference() + + + # Log Probability of old state + if (self.oldlogp is None): + self.oldlogp = self.fastlogp(point) + + # Log-Probability of proposed state + proposal_logp = self.fastlogp(proposal) + + # Log probability of move, takes proposal symmetry into account to ensure detailed balance + move_logp = proposal_symmetry + proposal_logp - self.oldlogp + + # Accept / Reject + if (move_logp==np.nan): + print "Warning: Invalid proposal in MetropolisHastings.step(..) (logp is not a number)" + if move_logp!=np.nan and log(uniform()) < move_logp: + # Accept proposed value + newpoint = proposal + self.oldlogp = proposal_logp + else: + # Reject proposed value + newpoint = point# + + self.steps_until_tune -= 1 + + return newpoint + + def _ensure_tuned(self): + if self.tune and not self.steps_until_tune: + # Tune scaling parameter + acc_rate = self.accepted / float(self.tune_interval) + self.acceptance_rate.append(acc_rate) + self.scaling = mh_tune(self.scaling, acc_rate) + for p in self.proposals: + p.tune_stepsize(self.scaling) + # Reset counter + self.steps_until_tune = self.tune_interval + self.accepted = 0 + +def mh_tune(scale, acc_rate): + """ + Tunes the scaling parameter for the proposal distribution + according to the acceptance rate over the last tune_interval: + + Rate Variance adaptation + ---- ------------------- + <0.001 x 0.1 + <0.05 x 0.5 + <0.2 x 0.9 + >0.5 x 1.1 + >0.75 x 2 + >0.95 x 10 + + """ + #print "Acceptance rate: %f - scale: %f" % (acc_rate, scale) + # Switch statement + if acc_rate < 0.001: + # reduce by 90 percent + scale *= 0.1 + elif acc_rate < 0.05: + # reduce by 50 percent + scale *= 0.5 + elif acc_rate < 0.2: + # reduce by ten percent + scale *= 0.9 + elif acc_rate > 0.95: + # increase by factor of ten + scale *= 10.0 + elif acc_rate > 0.75: + # increase by double + scale *= 2.0 + elif acc_rate > 0.5: + # increase by ten percent + scale *= 1.1 + + return scale \ No newline at end of file
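For reference, the accept/reject rule implemented in MetropolisHastings.step(..) above can be summarized in a few lines of plain NumPy. This is only a minimal sketch of the logic, not part of the patch; the helper name mh_accept is made up for illustration. It uses np.isfinite for the validity check, since comparing a value against np.nan with == or != does not behave as a NaN test.

import numpy as np

def mh_accept(logp_old, logp_new, proposal_logp_difference, rng=np.random):
    # Hastings correction: log q(x_old | x_new) - log q(x_new | x_old).
    # For symmetric proposals this term is 0 and the rule reduces to plain Metropolis.
    move_logp = proposal_logp_difference + logp_new - logp_old
    if not np.isfinite(move_logp):
        return False  # reject proposals whose move log-probability is nan or -inf
    return np.log(rng.uniform()) < move_logp

# Example: an independence proposal that is twice as likely to generate the old
# point as the new one contributes log(2) to the correction term.
accepted = mh_accept(logp_old=-12.3, logp_new=-11.9,
                     proposal_logp_difference=np.log(2.0))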