From 872370ddc583c7cf06eba8958337603258490101 Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Thu, 5 Jun 2014 14:21:23 +0000 Subject: [PATCH 01/14] Changed bound() function so that it efficiently deals with scalars and vector inputs without calculating unneccessary and possibly invalid expressions --- pymc/distributions/dist_math.py | 51 +++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/pymc/distributions/dist_math.py b/pymc/distributions/dist_math.py index 08080e5572..8a80dab24e 100644 --- a/pymc/distributions/dist_math.py +++ b/pymc/distributions/dist_math.py @@ -13,15 +13,44 @@ zeros_like, ones, ones_like, concatenate, constant, argmax) +import theano -from numpy import pi, inf, nan -from .special import gammaln, multigammaln +from numpy import pi, inf, nan, float64, array +from .special import gammaln, multigammaln +from theano.ifelse import ifelse from theano.printing import Print from .distribution import * +def guess_is_scalar(c): + bcast = None + if (isinstance(c, t.TensorType)): + return c.broadcastable == () + + if (isinstance(c, t.Variable) or isinstance(c, t.Constant)): + return c.type.broadcastable == () + + tt = t.as_tensor_variable(c) + return tt.type.broadcastable == () def bound(logp, *conditions): + if (guess_is_scalar(logp)): + return bound_scalar(logp, *conditions) + seqs = [logp] + non_seqs = [] + for i, c in enumerate(conditions): + if (guess_is_scalar(c)): + non_seqs.append(c) + else: + seqs.append(c) + logp_vec, updates = theano.scan(fn=bound_scalar, + outputs_info=None, + sequences=seqs, non_sequences=non_seqs) + return logp_vec + +impossible = constant(-inf, dtype=float64) # We re-use that constant + +def bound_scalar(logp, *conditions): """ Bounds a log probability density with several conditions @@ -34,10 +63,26 @@ def bound(logp, *conditions): ------- logp if all conditions are true -inf if some are false + """ + cond = alltrue(conditions) + return ifelse(cond, logp, impossible) + + +def bound_switched(logp, *conditions): """ + Bounds a log probability density with several conditions - return switch(alltrue(conditions), logp, -inf) + Parameters + ---------- + logp : float + *conditionss : booleans + Returns + ------- + logp if all conditions are true + -inf if some are false + """ + return switch(alltrue(conditions), logp, impossible) def alltrue(vals): ret = 1 From 9bbd3d6249ded2208c95f7ba39cf5b8cd0678329 Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Thu, 5 Jun 2014 14:25:09 +0000 Subject: [PATCH 02/14] Added full Metropolis Hastings step method for arbitrary (asymmetric) proposal distributions. Allows to create efficient custom samplers. 
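For orientation, a minimal sketch of the accept/reject rule this step method implements, in plain NumPy. The names here (mh_accept, rng) are illustrative and not part of the diff; proposal_logp_diff stands for log q(old|new) - log q(new|old), the Hastings correction a proposal reports, which is 0 for symmetric proposals.

    import numpy as np

    def mh_accept(logp_old, logp_new, proposal_logp_diff, rng=np.random):
        # Hastings ratio in log space: asymmetry correction + new logp - old logp.
        move_logp = proposal_logp_diff + logp_new - logp_old
        # Accept with probability min(1, exp(move_logp)).
        return np.isfinite(move_logp) and np.log(rng.uniform()) < move_logp
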
--- pymc/examples/metropolis_hastings.py | 43 ++++ pymc/step_methods/metropolis_hastings.py | 246 +++++++++++++++++++++++ 2 files changed, 289 insertions(+) create mode 100644 pymc/examples/metropolis_hastings.py create mode 100644 pymc/step_methods/metropolis_hastings.py diff --git a/pymc/examples/metropolis_hastings.py b/pymc/examples/metropolis_hastings.py new file mode 100644 index 0000000000..3c94723054 --- /dev/null +++ b/pymc/examples/metropolis_hastings.py @@ -0,0 +1,43 @@ +import numpy as np +from pymc import * +from pymc.step_methods.metropolis_hastings import * +import theano +import theano.tensor as T +import pandas +import matplotlib.pylab as plt +#theano.config.mode = 'DebugMode' + +model = Model() +with model: + + k = 5 + a = constant(np.array([2, 6., 4, 2, 2])) + pa = a / T.sum(a) + + p, p_m1 = model.TransformedVar( + 'p', Dirichlet.dist(a, shape=k), + simplextransform) + + c = Categorical('c', pa) + +def run(n=50000): + if n == "short": + n = 50 + with model: + ' Try this with a Metropolis instance, and watch it fail ..' + step = MetropolisHastings() + trace = sample(n, step) + return trace +if __name__ == '__main__': + tr = run() + t1 = pandas.Series(tr['p'][:,0]) + t2 = pandas.Series(tr['p'][:,1]) + t3 = pandas.Series(tr['p'][:,2]) + t4 = pandas.Series(tr['p'][:,3]) + t5 = pandas.Series(tr['p'][:,4]) + t6 = pandas.Series(tr['c']) + df = pandas.DataFrame({'a' : t1,'b' : t2, 'c' : t3, 'd' : t4, 'cat' : t6}) + pandas.scatter_matrix(df) + plt.show() + + diff --git a/pymc/step_methods/metropolis_hastings.py b/pymc/step_methods/metropolis_hastings.py new file mode 100644 index 0000000000..48a62b09ca --- /dev/null +++ b/pymc/step_methods/metropolis_hastings.py @@ -0,0 +1,246 @@ +from numpy.linalg import cholesky + +from ..core import * +from .quadpotential import quad_potential + +from .arraystep import * +from numpy.random import normal, standard_cauchy, standard_exponential, poisson, random, uniform +from numpy import round, exp, copy, where, log, isfinite +import numpy as np +from ..distributions.discrete import Categorical +import theano.tensor as T +import theano +from metropolis import * + +__all__ = ['MetropolisHastings', 'GenericProposal', 'ProposalAdapter'] + +''' +More general version of Metropolis, which allows to combine different proposal distributions for +different sets of variables and then do a single accept / reject. + +Can also handle asymmetric proposals, i.e. it's an implementation of the Metropolis Hastings +algorithm, not just the Metropolis algorithm. + +The main purpose of this class is to allow for tailored proposal distributions for specialized +applications, such as efficiently sampling from a mixture model. + +@author Kai Londenberg (Kai.Londenberg@gmail.com) +''' + +class GenericProposal(object): + + ''' + Asymmetric proposal distribution. 
+ Has to be able to calculate move propabilities + ''' + def proposal_logp_difference(self): + ''' Return log-probability difference of move from one point to another + for the last proposed move + + Returns 0.0 by default, which is true for all symmetric proposal distributions + ''' + return 0.0 + + def propose_move(self, point): + ''' + Propose a move: Return a new point proposal + Destructively writes to the given point, destroying the old values in it + ''' + raise NotImplementedError("Called abstract method") + + def tune_stepsize(self, stepsize_factor=1.0): + ''' + Tune proposal distribution to increase / decrease the stepsize of the proposals by the given factor (if possible) + ''' + pass + + +class ProposalAdapter(GenericProposal): + + def __init__(self, vars, proposal_dist=NormalProposal, scale=None): + self.vars = vars + self.ordering = ArrayOrdering(vars) + if scale is None: + scale = np.ones(sum(v.dsize for v in vars)) + self.proposal = proposal_dist(scale) + self.discrete = np.array([v.dtype in discrete_types for v in vars]) + self.stepsize_factor = 1.0 + + def propose_move(self, point): + i = 0 + delta = np.atleast_1d( self.proposal() * self.stepsize_factor) + for var, slc, shp in self.ordering.vmap: + dvar = delta[slc] + if self.discrete[i]: + dvar = round(delta[slc], 0).astype(int) + point[var] = point[var] + np.reshape(dvar, shp) + i += 1 + return point + + def tune_stepsize(self, stepsize_factor=1.0): + self.stepsize_factor = stepsize_factor + +class CategoricalProposal(GenericProposal): + + def __init__(self, vars, model=None): + model = modelcontext(model) + self.vars = vars + varidx = T.iscalar('varidx') + logpt_elems = [] + self.paramfuncs = [] + self._proposal_logp_difference = 0.0 + + for i, v in enumerate(vars): + assert isinstance(v.distribution, Categorical) + self.paramfuncs.append(model.fastfn(v.distribution.p)) + logpt_elems.append(v.distribution.logp(v)) + + self.logpfunc = model.fastfn(T.add(*logpt_elems)) + + def propose_move(self, point): + move_logp = 0.0 + for i, v in enumerate(self.vars): + weights = self.paramfuncs[i](point) + + oldvalue = point[v.name] + newvalue = np.random.choice(len(weights),1, p=weights)[0] + 'Move probability: Probability of moving from new state to old state divided by probability of moving from old state to new state' + move_logp += log(weights[oldvalue]) - log(weights[newvalue]) + point[v.name] = newvalue + + self._proposal_logp_difference = move_logp + return point + + def proposal_logp_difference(self): + return self._proposal_logp_difference + + +class MetropolisHastings(object): + """ + Metropolis-Hastings sampling step + + Parameters + ---------- + vars : list + List of variables for sampler + proposals : array of GenericProposal instances + scaling : scalar or array + Initial scale factor for proposal. Defaults to 1. + tune : bool + Flag for tuning. Defaults to True. + model : PyMC Model + Optional model for sampling step. Defaults to None (taken from context). 
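    Example
    -------
    A condensed, illustrative variant of the pymc/examples/metropolis_hastings.py
    script added by this patch (proposal distributions are picked automatically,
    one per variable type)::

        import numpy as np
        import theano.tensor as T
        from pymc import *
        from pymc.step_methods.metropolis_hastings import MetropolisHastings

        with Model() as model:
            a = constant(np.array([2., 6., 4., 2., 2.]))
            p, p_m1 = model.TransformedVar('p', Dirichlet.dist(a, shape=5),
                                           simplextransform)
            c = Categorical('c', a / T.sum(a))
            step = MetropolisHastings()
            trace = sample(500, step)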
+ + """ + def __init__(self, proposals=None, scaling=1., tune=True, tune_interval=100, model=None): + model = modelcontext(model) + self.model = model + if (proposals is None): + cvars = [] + ovars = [] + for v in model.vars: + if (isinstance(v.distribution, Categorical)): + cvars.append(v) + else: + ovars.append(v) + if (len(cvars)>0): + proposals = [ CategoricalProposal(cvars), ProposalAdapter(ovars, NormalProposal) ] + else: + proposals = [ ProposalAdapter(ovars, NormalProposal) ] + self.proposals = proposals + var_set = set() + for p in proposals: + p.tune_stepsize(scaling) + for v in p.vars: + var_set.add(v) + vars = list(var_set) + + self.scaling = scaling + self.tune = tune + self.tune_interval = tune_interval + self.steps_until_tune = tune_interval + self.accepted = 0 + self.fastlogp = self.model.fastlogp + self.oldlogp = None + + def step(self, point): + self._ensure_tuned() + proposal = point.copy() + proposal_symmetry = 0.0 + for prop in self.proposals: + proposal = prop.propose_move(proposal) + proposal_symmetry += prop.proposal_logp_difference() + + + # Log Probability of old state + if (self.oldlogp is None): + self.oldlogp = self.fastlogp(point) + + # Log-Probability of proposed state + proposal_logp = self.fastlogp(proposal) + + # Log probability of move, takes proposal symmetry into account to ensure detailed balance + move_logp = proposal_symmetry + proposal_logp - self.oldlogp + + # Accept / Reject + if isfinite(move_logp) and log(uniform()) < move_logp: + # Accept proposed value + newpoint = proposal + self.oldlogp = proposal_logp + self.accepted += 1 + else: + # Reject proposed value + newpoint = point# + + self.steps_until_tune -= 1 + + return newpoint + + def _ensure_tuned(self): + if self.tune and not self.steps_until_tune: + # Tune scaling parameter + self.scaling = mh_tune( + self.scaling, self.accepted / float(self.tune_interval)) + for p in self.proposals: + p.tune_stepsize(self.scaling) + # Reset counter + self.steps_until_tune = self.tune_interval + self.accepted = 0 + +def mh_tune(scale, acc_rate): + """ + Tunes the scaling parameter for the proposal distribution + according to the acceptance rate over the last tune_interval: + + Rate Variance adaptation + ---- ------------------- + <0.001 x 0.1 + <0.05 x 0.5 + <0.2 x 0.9 + >0.5 x 1.1 + >0.75 x 2 + >0.95 x 10 + + """ + print "Acceptance rate: %f - scale: %f" % (acc_rate, scale) + # Switch statement + if acc_rate < 0.001: + # reduce by 90 percent + scale *= 0.1 + elif acc_rate < 0.05: + # reduce by 50 percent + scale *= 0.5 + elif acc_rate < 0.2: + # reduce by ten percent + scale *= 0.9 + elif acc_rate > 0.95: + # increase by factor of ten + scale *= 10.0 + elif acc_rate > 0.75: + # increase by double + scale *= 2.0 + elif acc_rate > 0.5: + # increase by ten percent + scale *= 1.1 + + return scale \ No newline at end of file From 9db4738e2b2bfe1fd57a3102c808bc11fdf8dc3d Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Thu, 5 Jun 2014 14:25:20 +0000 Subject: [PATCH 03/14] Added Gaussian Mixture distribution, which can be efficiently sampled from using Metropolis Hastings and a custom Proposal Distribution --- pymc/distributions/mixtures.py | 94 ++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 pymc/distributions/mixtures.py diff --git a/pymc/distributions/mixtures.py b/pymc/distributions/mixtures.py new file mode 100644 index 0000000000..13bdd19565 --- /dev/null +++ b/pymc/distributions/mixtures.py @@ -0,0 +1,94 @@ + + +from pymc import * + +from numpy.random import 
normal +import numpy as np +import pylab as pl +from itertools import product +import theano +import theano.tensor as T +from theano.ifelse import ifelse +from numpy import pi +from numpy.random import uniform, multivariate_normal +from pymc.model import FreeRV, ObservedRV +import numpy as np +import numpy.linalg as np_linalg +from pymc.step_methods.metropolis_hastings import GenericProposal + +class MvGaussianMixture(Continuous): + + def __init__(self, shape, cat_var, mus, taus, model=None, *args, **kwargs): + ''' + Creates a continuous mixture distribution which can be efficiently evaluated and sampled from. + + Args: + shape: Shape of the distribution. All kernels need to have this same shape as well. + weights: weight vector (may be a theano shared variable or expression. Must be able to evaluate this in the context of the model). + kernels: list of mixture component distributions. These have to be MixtureKernel instances (for example MvNormalKernel ) of same shape + testval: Test value for logp computations + ''' + super(MvGaussianMixture, self).__init__(*args, shape=shape, **kwargs) + assert isinstance(cat_var, FreeRV) + assert isinstance(cat_var.distribution, Categorical) + self.cat_var = cat_var + model = modelcontext(model) + weights = cat_var.distribution.p + self.weights = weights + self.mus = mus + self.taus = taus + self.mu_t = T.stacklists(mus) + self.tau_t = T.stacklists(taus) + self.shape = shape + self.testval = np.zeros(self.shape, self.dtype) + self.param_fn = model.fastfn([self.mu_t[self.cat_var], self.tau_t[self.cat_var]]) + self.last_cov_value = None + self.last_tau_value = None + + def logp(self, value): + mu = self.mu_t[self.cat_var] + tau = self.tau_t[self.cat_var] + + delta = value - mu + k = tau.shape[0] + + return 1/2. * (-k * log(2*pi) + log(det(tau)) - dot(delta.T, dot(tau, delta))) + + def draw_mvn(self, mu, tau): + if (tau==self.last_tau_value): + mcov = self.last_cov_value + else: + mcov = np_linalg.inv(tau) + self.last_cov_value = mcov + + return np.random.multivariate_normal(mean=mu, mcov=tau) + + def draw(self, point): + mu, tau = self.param_fn(point) + if (tau==self.last_tau_value): + mcov = self.last_cov_value + else: + mcov = np_linalg.inv(tau) + self.last_cov_value = mcov + return self.draw_mvn + + +class GMMProposal(GenericProposal): + + def __init__(self, gmm_var, model=None): + assert isinstance(gmm_var.distribution, MvGaussianMixture) + model = modelcontext(model) + self.gmm_var = gmm_var + self.logpfunc = model.fastfn(gmm_var.distribution.logp) + self._proposal_logp_difference = 0.0 + + def propose_move(self, point): + old_logp = self.logpfunc(point) + point[self.gmm_var.name] = self.gmm_var.draw(point) + new_logp = self.logpfunc(point) + self._proposal_logp_difference = old_logp-new_logp + return point + + def proposal_logp_difference(self): + return self._proposal_logp_difference + From d74ef2f62a4aa184581550198bbb107f02032ab2 Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Thu, 5 Jun 2014 14:34:08 +0000 Subject: [PATCH 04/14] Renamed Metropolis Hastings example --- pymc/examples/{metropolis_hastings.py => mh_example.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pymc/examples/{metropolis_hastings.py => mh_example.py} (100%) diff --git a/pymc/examples/metropolis_hastings.py b/pymc/examples/mh_example.py similarity index 100% rename from pymc/examples/metropolis_hastings.py rename to pymc/examples/mh_example.py From 7bf3dca1f45417a8658d080b691a604eef0c8d5c Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Thu, 5 Jun 2014 
17:29:55 +0000 Subject: [PATCH 05/14] Fixed problem when sampling from transformed Dirichlet with hyperprior containing 1's. --- pymc/distributions/dist_math.py | 5 +++++ pymc/distributions/transforms.py | 10 ++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pymc/distributions/dist_math.py b/pymc/distributions/dist_math.py index 8a80dab24e..f73426469e 100644 --- a/pymc/distributions/dist_math.py +++ b/pymc/distributions/dist_math.py @@ -97,6 +97,11 @@ def logpow(x, m): """ return switch(eq(x, 0) & eq(m, 0), 0, m * log(x)) +def logpow_relaxed(x, m): + """ + Calculates log(x**m), unless m==0 or x==0 in which case it will return 0 + """ + return switch(eq(x, 0) | eq(m, 0), 0, m * log(x)) def factln(n): return gammaln(n + 1) diff --git a/pymc/distributions/transforms.py b/pymc/distributions/transforms.py index 6a24589459..a1ffe9a3d5 100644 --- a/pymc/distributions/transforms.py +++ b/pymc/distributions/transforms.py @@ -34,8 +34,14 @@ def __init__(self, dist, transform, *args, **kwargs): transform : Transform args, kwargs arguments to Distribution""" - forward = transform.forward + forward = transform.forward + defaults = dist.defaults try: + # If we don't do this, we will have problems + # with invalid mode values of transformed Dirichlet variables where one of the hyperparams is exactly 1 + self.default_val = forward(dist.default()) + defaults = ['default_val'] + defaults + testval = forward(dist.testval) except TypeError: testval = dist.testval @@ -52,7 +58,7 @@ def __init__(self, dist, transform, *args, **kwargs): super(TransformedDistribution, self).__init__(v.shape.tag.test_value, v.dtype, - testval, dist.defaults, + testval, defaults, *args, **kwargs) From b87c4cc5abd56993da6ceb0821cac073e74c1eeb Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Thu, 5 Jun 2014 17:31:41 +0000 Subject: [PATCH 06/14] Added an example which uses a GMM as a prior for another distribution. Fixed some problems that occurred. --- pymc/distributions/mixtures.py | 15 ++++ pymc/examples/gmm_mixture_sampling.py | 104 +++++++++++++++++++++++ pymc/examples/mh_example.py | 2 +- pymc/step_methods/metropolis_hastings.py | 17 ++-- 4 files changed, 130 insertions(+), 8 deletions(-) create mode 100644 pymc/examples/gmm_mixture_sampling.py diff --git a/pymc/distributions/mixtures.py b/pymc/distributions/mixtures.py index 13bdd19565..882ec0e6bb 100644 --- a/pymc/distributions/mixtures.py +++ b/pymc/distributions/mixtures.py @@ -16,6 +16,21 @@ import numpy.linalg as np_linalg from pymc.step_methods.metropolis_hastings import GenericProposal +""" +Multivariate Mixture distributions. + +These distributions have a lot of potential uses. For one thing: They can be used to approximate every other density function, +and there are a lot of algorithms for efficiently creating density estimates (i.e. Kernel Density Estimates (KDE), +Gaussian Mixture Model learning and others. + +Once we have learned such a density, we could use it as a prior density. To do that, we need to be able to +efficiently sample from it. Luckily, that's simple and efficient. Unless you use the wrong proposal distribution, of course. + +That's where these classes come in handy. 
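A condensed usage sketch, adapted from pymc/examples/gmm_mixture_sampling.py later in
this series (here `a` is a Dirichlet hyperparameter vector and `mus` / `taus` are lists
of mean vectors and precision matrices of matching shape):

    with Model() as model:
        p, p_m1 = model.TransformedVar('p', Dirichlet.dist(a, shape=len(mus)),
                                       simplextransform)
        c = Categorical('c', p)
        gmm = MvGaussianMixture('gmm', shape=mus[0].shape, cat_var=c,
                                mus=mus, taus=taus)
        step = MetropolisHastings(vars=model.vars, proposals=[GMMProposal(gmm)])
        trace = sample(5000, step)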
+ +@author Kai Londenberg ( Kai.Londenberg@gmail.com ) + +""" class MvGaussianMixture(Continuous): def __init__(self, shape, cat_var, mus, taus, model=None, *args, **kwargs): diff --git a/pymc/examples/gmm_mixture_sampling.py b/pymc/examples/gmm_mixture_sampling.py new file mode 100644 index 0000000000..c62febb2bd --- /dev/null +++ b/pymc/examples/gmm_mixture_sampling.py @@ -0,0 +1,104 @@ +import numpy as np +from pymc import * + +from pymc.step_methods.metropolis_hastings import * +from pymc.distributions.mixtures import * +from pymc.distributions.special import gammaln +from pymc.distributions.dist_math import * + +import theano +import theano.tensor as T +import pandas +import matplotlib.pylab as plt +import numpy.linalg + +theano.config.mode = 'FAST_COMPILE' +theano.config.compute_test_value = 'raise' + +mu0 = np.ones((2), dtype=np.float64) +cov0 = np.eye(2, dtype=np.float64) +tau0 = numpy.linalg.inv(cov0) + +mu1 = mu0+1.5 +cov1 = cov0.copy() * 1.4 +cov1[0,1] = 0.8 +cov1[1,0] = 0.8 +tau1 = numpy.linalg.inv(cov1) + +mu2 = mu0-0.5 +cov2 = cov0.copy() * 1.4 +cov2[0,1] = 0.8 +cov2[1,0] = 0.8 +tau2 = numpy.linalg.inv(cov2) + +mu3 = mu0/0.5 + mu2-0.6 +cov3 = cov0.copy() * 1.4 +cov3[0,1] = -0.4 +cov3[1,0] = -0.4 +tau3 = numpy.linalg.inv(cov2) + +m2 = np.array([3.0]) +aval = np.array([4, 1, 4., 2.]) + + +def test_dirichlet_args(aval, kval, value): + a = t.dvector('a') + a.tag.test_value = aval + + k = t.dscalar('k') + k.tag.test_value = kval + # only defined for sum(value) == 1 + res = bound( + sum(logpow_relaxed( + value, a - 1) - gammaln(a), axis=0) + gammaln(sum(a)), + + k > 1, + all(a > 0)) + rf = theano.function([a, k], res) + return rf(aval, kval) + +print "Testing dirichlet arguments" +print test_dirichlet_args(aval, 4, aval) + +a = constant(aval) + +gma = gammaln(a) +gmaf = theano.function([], gma) +print "Gammaln of %r is %r" % (aval, gmaf()) +model = Model() +with model: + + k = 4 + pa = a / T.sum(a) + + p, p_m1 = model.TransformedVar( + 'p', Dirichlet.dist(a, shape=k), + simplextransform) + + c = Categorical('c', pa) + gmm = MvGaussianMixture('gmm', shape=mu0.shape, cat_var=c, mus=[mu0,mu1,mu2,mu3], taus=[tau0, tau1,tau2,tau3]) + ' Now, we ensure these arbitrary values sum to one. What do we get ? A nice probability which we can, in turn, sample from (or observe)' + gmm_p = Deterministic('gmm_p', gmm[0] / T.sum(gmm)) + result = Bernoulli('result', p=gmm_p, observed=np.array([0,1,0,0,0,0,1,1,1], dtype=np.int32)) + +def run(n=50000): + if n == "short": + n = 50 + with model: + ' Try this with a Metropolis instance, and watch it fail ..' 
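        # (Illustrative note: a plain Metropolis step proposes symmetric random-walk
        # moves for every variable; the MetropolisHastings step below instead assigns a
        # CategoricalProposal to the discrete variable and applies the Hastings
        # correction for its asymmetric moves.)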
+ step = MetropolisHastings() + trace = sample(n, step) + return trace +if __name__ == '__main__': + tr = run() + t1 = pandas.Series(tr['p'][:,0]) + t2 = pandas.Series(tr['p'][:,1]) + t3 = pandas.Series(tr['p'][:,2]) + t4 = pandas.Series(tr['p'][:,3]) + t6 = pandas.Series(tr['c']) + t7 = pandas.Series(tr['result']) + df = pandas.DataFrame({'a' : t1,'b' : t2, 'c' : t3, 'd' : t4, 'cat' : t6, result : t7}) + pandas.scatter_matrix(df) + plt.show() + + diff --git a/pymc/examples/mh_example.py b/pymc/examples/mh_example.py index 3c94723054..f04676c2a6 100644 --- a/pymc/examples/mh_example.py +++ b/pymc/examples/mh_example.py @@ -18,7 +18,7 @@ 'p', Dirichlet.dist(a, shape=k), simplextransform) - c = Categorical('c', pa) + c = Categorical('c', p) def run(n=50000): if n == "short": diff --git a/pymc/step_methods/metropolis_hastings.py b/pymc/step_methods/metropolis_hastings.py index 48a62b09ca..ce02a00411 100644 --- a/pymc/step_methods/metropolis_hastings.py +++ b/pymc/step_methods/metropolis_hastings.py @@ -92,7 +92,7 @@ def __init__(self, vars, model=None): for i, v in enumerate(vars): assert isinstance(v.distribution, Categorical) - self.paramfuncs.append(model.fastfn(v.distribution.p)) + self.paramfuncs.append(model.fastfn(T.as_tensor_variable(v.distribution.p))) logpt_elems.append(v.distribution.logp(v)) self.logpfunc = model.fastfn(T.add(*logpt_elems)) @@ -101,12 +101,15 @@ def propose_move(self, point): move_logp = 0.0 for i, v in enumerate(self.vars): weights = self.paramfuncs[i](point) - oldvalue = point[v.name] - newvalue = np.random.choice(len(weights),1, p=weights)[0] - 'Move probability: Probability of moving from new state to old state divided by probability of moving from old state to new state' - move_logp += log(weights[oldvalue]) - log(weights[newvalue]) - point[v.name] = newvalue + try: + newvalue = np.random.choice(len(weights),1, p=weights)[0] + 'Move probability: Probability of moving from new state to old state divided by probability of moving from old state to new state' + move_logp += log(weights[oldvalue]) - log(weights[newvalue]) + point[v.name] = newvalue + except: + print "Warning: Invalid weights passed to CategoricalProposal. 
Might happen on first step" + point[v.name] = 0 self._proposal_logp_difference = move_logp return point @@ -222,7 +225,7 @@ def mh_tune(scale, acc_rate): >0.95 x 10 """ - print "Acceptance rate: %f - scale: %f" % (acc_rate, scale) + #print "Acceptance rate: %f - scale: %f" % (acc_rate, scale) # Switch statement if acc_rate < 0.001: # reduce by 90 percent From 7189a2102692df1d8070acffad38ca94265d37cc Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Fri, 6 Jun 2014 15:06:02 +0000 Subject: [PATCH 07/14] Reverted back to old (switch) implementation of bound, due to problems with other examples --- pymc/distributions/dist_math.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/pymc/distributions/dist_math.py b/pymc/distributions/dist_math.py index f73426469e..7235a15416 100644 --- a/pymc/distributions/dist_math.py +++ b/pymc/distributions/dist_math.py @@ -33,21 +33,6 @@ def guess_is_scalar(c): tt = t.as_tensor_variable(c) return tt.type.broadcastable == () -def bound(logp, *conditions): - if (guess_is_scalar(logp)): - return bound_scalar(logp, *conditions) - seqs = [logp] - non_seqs = [] - for i, c in enumerate(conditions): - if (guess_is_scalar(c)): - non_seqs.append(c) - else: - seqs.append(c) - logp_vec, updates = theano.scan(fn=bound_scalar, - outputs_info=None, - sequences=seqs, non_sequences=non_seqs) - return logp_vec - impossible = constant(-inf, dtype=float64) # We re-use that constant def bound_scalar(logp, *conditions): @@ -57,7 +42,7 @@ def bound_scalar(logp, *conditions): Parameters ---------- logp : float - *conditionss : booleans + *conditions : booleans Returns ------- @@ -68,7 +53,7 @@ def bound_scalar(logp, *conditions): return ifelse(cond, logp, impossible) -def bound_switched(logp, *conditions): +def bound(logp, *conditions): """ Bounds a log probability density with several conditions @@ -82,6 +67,11 @@ def bound_switched(logp, *conditions): logp if all conditions are true -inf if some are false """ + if (guess_is_scalar(logp)): + for c in conditions: + if (not guess_is_scalar(c)): + break + return bound_scalar(logp, *conditions) return switch(alltrue(conditions), logp, impossible) def alltrue(vals): From aef7a2963e6b2ce251bc16cace6d2e1ec0906b30 Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Fri, 6 Jun 2014 15:06:22 +0000 Subject: [PATCH 08/14] Reverted back to old (switch) implementation of bound, due to problems with other examples --- pymc/distributions/discrete.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc/distributions/discrete.py b/pymc/distributions/discrete.py index fa78baa11f..bdc517650f 100644 --- a/pymc/distributions/discrete.py +++ b/pymc/distributions/discrete.py @@ -285,7 +285,7 @@ def logp(self, value): p = self.p k = self.k - return bound(log(p[value]), + return bound_scalar(t.sum(log(p[value])), value >= 0, value <= (k - 1), le(abs(sum(p) - 1), 1e-5)) From 17d0b31969888f4002a02ff375611cee03776af2 Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Fri, 6 Jun 2014 15:28:50 +0000 Subject: [PATCH 09/14] Fixed case when arguments to bound_scalar have differing dtype --- pymc/distributions/dist_math.py | 4 +- pymc/distributions/mixtures.py | 54 ++++--- pymc/examples/gmm_mixture_sampling.py | 179 +++++++++++++---------- pymc/step_methods/metropolis_hastings.py | 89 +++++++---- 4 files changed, 200 insertions(+), 126 deletions(-) diff --git a/pymc/distributions/dist_math.py b/pymc/distributions/dist_math.py index 7235a15416..f77ab899ed 100644 --- a/pymc/distributions/dist_math.py +++ 
b/pymc/distributions/dist_math.py @@ -50,7 +50,7 @@ def bound_scalar(logp, *conditions): -inf if some are false """ cond = alltrue(conditions) - return ifelse(cond, logp, impossible) + return ifelse(cond, t.cast(logp, dtype='float64'), impossible) def bound(logp, *conditions): @@ -69,6 +69,8 @@ def bound(logp, *conditions): """ if (guess_is_scalar(logp)): for c in conditions: + if (type(c)==bool): + continue if (not guess_is_scalar(c)): break return bound_scalar(logp, *conditions) diff --git a/pymc/distributions/mixtures.py b/pymc/distributions/mixtures.py index 882ec0e6bb..a8303607b0 100644 --- a/pymc/distributions/mixtures.py +++ b/pymc/distributions/mixtures.py @@ -47,7 +47,7 @@ def __init__(self, shape, cat_var, mus, taus, model=None, *args, **kwargs): assert isinstance(cat_var, FreeRV) assert isinstance(cat_var.distribution, Categorical) self.cat_var = cat_var - model = modelcontext(model) + self.model = modelcontext(model) weights = cat_var.distribution.p self.weights = weights self.mus = mus @@ -56,9 +56,9 @@ def __init__(self, shape, cat_var, mus, taus, model=None, *args, **kwargs): self.tau_t = T.stacklists(taus) self.shape = shape self.testval = np.zeros(self.shape, self.dtype) - self.param_fn = model.fastfn([self.mu_t[self.cat_var], self.tau_t[self.cat_var]]) - self.last_cov_value = None - self.last_tau_value = None + self.last_cov_value = {} + self.last_tau_value = {} + self.param_fn = None def logp(self, value): mu = self.mu_t[self.cat_var] @@ -69,39 +69,47 @@ def logp(self, value): return 1/2. * (-k * log(2*pi) + log(det(tau)) - dot(delta.T, dot(tau, delta))) - def draw_mvn(self, mu, tau): - if (tau==self.last_tau_value): - mcov = self.last_cov_value - else: - mcov = np_linalg.inv(tau) - self.last_cov_value = mcov + def draw_mvn(self, mu, cov): - return np.random.multivariate_normal(mean=mu, mcov=tau) + return np.random.multivariate_normal(mean=mu, cov=cov) def draw(self, point): - mu, tau = self.param_fn(point) - if (tau==self.last_tau_value): - mcov = self.last_cov_value + if (self.param_fn is None): + self.param_fn = model.fastfn([self.cat_var, self.mu_t[self.cat_var], self.tau_t[self.cat_var]]) + + cat, mu, tau = self.param_fn(point) + # Cache cov = inv(tau) for each value of cat + cat = int(cat) + last_tau = self.last_tau_value.get(cat, None) + if (last_tau is not None and np.allclose(tau,last_tau)): + mcov = self.last_cov_value[cat] else: mcov = np_linalg.inv(tau) - self.last_cov_value = mcov - return self.draw_mvn + self.last_cov_value[cat] = mcov + self.last_tau_value[cat] = tau + + return self.draw_mvn(mu, mcov) - +_impossible = float('-inf') class GMMProposal(GenericProposal): def __init__(self, gmm_var, model=None): + super(GMMProposal, self).__init__([gmm_var]) assert isinstance(gmm_var.distribution, MvGaussianMixture) model = modelcontext(model) self.gmm_var = gmm_var - self.logpfunc = model.fastfn(gmm_var.distribution.logp) + self.logpfunc = model.fastfn(gmm_var.distribution.logp(gmm_var)) self._proposal_logp_difference = 0.0 - def propose_move(self, point): - old_logp = self.logpfunc(point) - point[self.gmm_var.name] = self.gmm_var.draw(point) - new_logp = self.logpfunc(point) - self._proposal_logp_difference = old_logp-new_logp + + def propose_move(self, from_point, point): + old_logp = self.logpfunc(from_point) + point[self.gmm_var.name] = self.gmm_var.distribution.draw(point) + new_logp = self.logpfunc(point) + if (old_logp!=np.nan and old_logp!=_impossible): + self._proposal_logp_difference = old_logp-new_logp + else: + self._proposal_logp_difference = 0.0 
return point def proposal_logp_difference(self): diff --git a/pymc/examples/gmm_mixture_sampling.py b/pymc/examples/gmm_mixture_sampling.py index c62febb2bd..168a076fc2 100644 --- a/pymc/examples/gmm_mixture_sampling.py +++ b/pymc/examples/gmm_mixture_sampling.py @@ -8,6 +8,7 @@ import theano import theano.tensor as T +from theano.ifelse import ifelse as theano_ifelse import pandas import matplotlib.pylab as plt import numpy.linalg @@ -15,90 +16,120 @@ theano.config.mode = 'FAST_COMPILE' theano.config.compute_test_value = 'raise' -mu0 = np.ones((2), dtype=np.float64) -cov0 = np.eye(2, dtype=np.float64) -tau0 = numpy.linalg.inv(cov0) +''' +Example which demonstrates the use of the MvGaussianMixture as a flexible (i.e. arbitrary) Prior +and the usage of the MetropolisHastings step method with a custom Proposal +''' -mu1 = mu0+1.5 -cov1 = cov0.copy() * 1.4 -cov1[0,1] = 0.8 -cov1[1,0] = 0.8 -tau1 = numpy.linalg.inv(cov1) +# Some helper methods +def is_pos_def(x): + return np.all(np.linalg.eigvals(x) > 0) -mu2 = mu0-0.5 -cov2 = cov0.copy() * 1.4 -cov2[0,1] = 0.8 -cov2[1,0] = 0.8 -tau2 = numpy.linalg.inv(cov2) - -mu3 = mu0/0.5 + mu2-0.6 -cov3 = cov0.copy() * 1.4 -cov3[0,1] = -0.4 -cov3[1,0] = -0.4 -tau3 = numpy.linalg.inv(cov2) - -m2 = np.array([3.0]) -aval = np.array([4, 1, 4., 2.]) - - -def test_dirichlet_args(aval, kval, value): - a = t.dvector('a') - a.tag.test_value = aval - - k = t.dscalar('k') - k.tag.test_value = kval - # only defined for sum(value) == 1 - res = bound( - sum(logpow_relaxed( - value, a - 1) - gammaln(a), axis=0) + gammaln(sum(a)), - - k > 1, - all(a > 0)) - rf = theano.function([a, k], res) - return rf(aval, kval) - -print "Testing dirichlet arguments" -print test_dirichlet_args(aval, 4, aval) - -a = constant(aval) - -gma = gammaln(a) -gmaf = theano.function([], gma) -print "Gammaln of %r is %r" % (aval, gmaf()) -model = Model() -with model: +def tmin(a,b): + return T.switch(T.lt(a,b), a, b) + +def tmax(a,b): + return T.switch(T.gt(a,b), a, b) - k = 4 - pa = a / T.sum(a) +''' We construct a number of valid +parameters for the multivariate normal. - p, p_m1 = model.TransformedVar( - 'p', Dirichlet.dist(a, shape=k), - simplextransform) +In order to ensure that the covariance matrices are positive semidefinite, they are risen to the (matrix-) power of two. +Precision matrices are obtained by inverting the covariance matrices. +''' +def create_model(with_observations=False): - c = Categorical('c', pa) - gmm = MvGaussianMixture('gmm', shape=mu0.shape, cat_var=c, mus=[mu0,mu1,mu2,mu3], taus=[tau0, tau1,tau2,tau3]) - ' Now, we ensure these arbitrary values sum to one. What do we get ? 
A nice probability which we can, in turn, sample from (or observe)' - gmm_p = Deterministic('gmm_p', gmm[0] / T.sum(gmm)) - result = Bernoulli('result', p=gmm_p, observed=np.array([0,1,0,0,0,0,1,1,1], dtype=np.int32)) + # First, we create some parameters for a Multivariate Normal + mu0 = np.ones((2), dtype=np.float64) + cov0 = np.eye(2, dtype=np.float64) * 0.2 + tau0 = numpy.linalg.inv(cov0) + + mu1 = mu0+5.5 + cov1 = cov0.copy() * 1.4 + cov1[0,1] = 0.8 + cov1[1,0] = 0.8 + cov1 = np.linalg.matrix_power(cov1,2) + tau1 = numpy.linalg.inv(cov1) + + mu2 = mu0-2.5 + cov2 = cov0.copy() * 1.4 + cov2[0,1] = 0.8 + cov2[1,0] = 0.8 + cov2 = np.linalg.matrix_power(cov2,2) + tau2 = numpy.linalg.inv(cov2) + + mu3 = mu0*-0.5 + 2.0*mu2 + cov3 = cov0.copy() * 0.9 + cov3[0,1] = -0.8 + cov3[1,0] = -0.8 + cov3 = np.linalg.matrix_power(cov3,2) + tau3 = numpy.linalg.inv(cov3) + + m2 = np.array([3.0]) + aval = np.array([4, 3, 2., 1.]) + a = constant(aval) + + model = Model() -def run(n=50000): - if n == "short": - n = 50 with model: + k = 4 + p, p_m1 = model.TransformedVar( + 'p', Dirichlet.dist(a, shape=k), + simplextransform) + + c = Categorical('c', p) + gmm = MvGaussianMixture('gmm', shape=mu0.shape, cat_var=c, mus=[mu0,mu1,mu2,mu3], taus=[tau0, tau1,tau2,tau3]) + ' Now, we ensure these arbitrary values sum to one. What do we get ? A nice probability which we can, in turn, sample from (or observe)' + fac = T.constant( np.array([10.0], dtype=np.float64)) + gmm_p = Deterministic('gmm_p', (tmax(tmin(gmm[0], fac) / fac, T.constant(np.array([0.0], dtype=np.float64))))) + pbeta = Beta('pbeta', 1,1) + if (with_observations): + result = Bernoulli('result', p=gmm_p[0], observed=np.array([1,0,1], dtype=np.bool)) + else: + result = Bernoulli('result', p=gmm_p[0]) ' Try this with a Metropolis instance, and watch it fail ..' 
- step = MetropolisHastings() - trace = sample(n, step) - return trace -if __name__ == '__main__': - tr = run() - t1 = pandas.Series(tr['p'][:,0]) - t2 = pandas.Series(tr['p'][:,1]) - t3 = pandas.Series(tr['p'][:,2]) - t4 = pandas.Series(tr['p'][:,3]) + step = MetropolisHastings(vars=model.vars, proposals=[GMMProposal(gmm)]) + return model, step + +def plot_scatter_matrix(title, tr, fig=None): + if (fig is None): + fig = plt.Figure() t6 = pandas.Series(tr['c']) - t7 = pandas.Series(tr['result']) - df = pandas.DataFrame({'a' : t1,'b' : t2, 'c' : t3, 'd' : t4, 'cat' : t6, result : t7}) + t8 = pandas.Series(tr['gmm'][:,0]) + t9 = pandas.Series(tr['gmm'][:,1]) + t10 = pandas.Series(tr['gmm_p'][:,0]) + t11 = pandas.Series(tr['pbeta']) + df = pandas.DataFrame({'cat' : t6, 'gmm_0' : t8, 'gmm_1' : t9, 'p' : t10, 'pbeta' : t11}) pandas.scatter_matrix(df) + plt.title(title) + return fig + +def plot_p_prior_vs_posterior(prior_trace, posterior_trace, fig=None): + if (fig is None): + fig = plt.Figure() + pr = pandas.Series(prior_trace['gmm_p'][:,0]) + po = pandas.Series(posterior_trace['gmm_p'][:,0]) + plt.hist(pr, bins=40, range=(-0.1, 1.1), color='b', alpha=0.5, label='Prior') + plt.hist(po, bins=40, range=(-0.1, 1.1), color='g', alpha=0.5, label='Posterior') + plt.legend() plt.show() +def run(n=5000): + m1, step1 = create_model(False) + m2, step2 = create_model(True) + with m1: + trace1 = sample(n, step1) + with m2: + trace2 = sample(n, step2) + fig1 = plot_scatter_matrix('Prior Scatter Matrix', trace1) + plt.show() + fig2 = plot_scatter_matrix('Posterior Scatter Matrix', trace2) + plt.show() + plot_p_prior_vs_posterior(trace1, trace2) + plt.show() + +if __name__ == '__main__': + run() + print "Done" + diff --git a/pymc/step_methods/metropolis_hastings.py b/pymc/step_methods/metropolis_hastings.py index ce02a00411..8623dc0b23 100644 --- a/pymc/step_methods/metropolis_hastings.py +++ b/pymc/step_methods/metropolis_hastings.py @@ -1,6 +1,7 @@ from numpy.linalg import cholesky from ..core import * +from ..model import FreeRV from .quadpotential import quad_potential from .arraystep import * @@ -29,6 +30,16 @@ class GenericProposal(object): + def __init__(self, vars): + ''' + Constructor for GenericProposal. To be called by subclass constructors. + Arguments: + vars: List of variables (FreeRV instances) this GenericProposal is responsible for. + ''' + for v in vars: + assert isinstance(v, FreeRV) + self.vars = vars + ''' Asymmetric proposal distribution. Has to be able to calculate move propabilities @@ -41,10 +52,13 @@ def proposal_logp_difference(self): ''' return 0.0 - def propose_move(self, point): + def propose_move(self, from_point, point): ''' Propose a move: Return a new point proposal Destructively writes to the given point, destroying the old values in it + Arguments: + old_point: State before proposals. Calculate previous logp with respect to this point + point: New proposal. Update it, and calculate new logp with respect to the updated version of this point. 
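        A minimal illustrative subclass (a symmetric random-walk proposal for a single
        scalar, continuous FreeRV; not part of this patch)::

            import numpy as np

            class RandomWalkProposal(GenericProposal):
                def __init__(self, var, scale=1.0):
                    super(RandomWalkProposal, self).__init__([var])
                    self.scale = scale
                    self.stepsize_factor = 1.0

                def propose_move(self, from_point, point):
                    # Symmetric move, so proposal_logp_difference() keeps returning 0.0
                    point[self.vars[0].name] = point[self.vars[0].name] + \
                        np.random.normal(scale=self.scale * self.stepsize_factor)
                    return point

                def tune_stepsize(self, stepsize_factor=1.0):
                    self.stepsize_factor = stepsize_factor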
''' raise NotImplementedError("Called abstract method") @@ -58,15 +72,16 @@ def tune_stepsize(self, stepsize_factor=1.0): class ProposalAdapter(GenericProposal): def __init__(self, vars, proposal_dist=NormalProposal, scale=None): - self.vars = vars + super(ProposalAdapter, self).__init__(vars) self.ordering = ArrayOrdering(vars) if scale is None: scale = np.ones(sum(v.dsize for v in vars)) self.proposal = proposal_dist(scale) self.discrete = np.array([v.dtype in discrete_types for v in vars]) self.stepsize_factor = 1.0 + - def propose_move(self, point): + def propose_move(self, from_point, point): i = 0 delta = np.atleast_1d( self.proposal() * self.stepsize_factor) for var, slc, shp in self.ordering.vmap: @@ -83,6 +98,7 @@ def tune_stepsize(self, stepsize_factor=1.0): class CategoricalProposal(GenericProposal): def __init__(self, vars, model=None): + super(CategoricalProposal, self).__init__(vars) model = modelcontext(model) self.vars = vars varidx = T.iscalar('varidx') @@ -97,19 +113,16 @@ def __init__(self, vars, model=None): self.logpfunc = model.fastfn(T.add(*logpt_elems)) - def propose_move(self, point): + def propose_move(self, from_point, point): move_logp = 0.0 for i, v in enumerate(self.vars): weights = self.paramfuncs[i](point) oldvalue = point[v.name] - try: - newvalue = np.random.choice(len(weights),1, p=weights)[0] - 'Move probability: Probability of moving from new state to old state divided by probability of moving from old state to new state' + newvalue = np.random.choice(len(weights),1, p=weights)[0] + 'Move probability: Probability of moving from new state to old state divided by probability of moving from old state to new state' + if (oldvalue!=newvalue): move_logp += log(weights[oldvalue]) - log(weights[newvalue]) point[v.name] = newvalue - except: - print "Warning: Invalid weights passed to CategoricalProposal. Might happen on first step" - point[v.name] = 0 self._proposal_logp_difference = move_logp return point @@ -119,14 +132,15 @@ def proposal_logp_difference(self): class MetropolisHastings(object): + """ Metropolis-Hastings sampling step Parameters ---------- vars : list - List of variables for sampler - proposals : array of GenericProposal instances + List of variables for sampler. + proposals : array of GenericProposal instances. For variables not covered by these, but present in "vars" list above, the constructor will choose Proposal distributions automatically scaling : scalar or array Initial scale factor for proposal. Defaults to 1. tune : bool @@ -135,29 +149,45 @@ class MetropolisHastings(object): Optional model for sampling step. Defaults to None (taken from context). """ - def __init__(self, proposals=None, scaling=1., tune=True, tune_interval=100, model=None): + def __init__(self, vars=None, proposals=None, scaling=1., tune=True, tune_interval=100, model=None): model = modelcontext(model) - self.model = model + self.model = model + if ((vars is None) and (proposals is None)): + vars = model.vars + + covered_vars = set() + if (proposals is not None): + for p in proposals: + for v in p.vars: + covered_vars.add(v) + if (vars is not None): + vset = set(vars) | set(covered_vars) + else: + vset = set(covered_vars) + self.vars = list(vset) + remaining_vars = vset-covered_vars if (proposals is None): + proposals = [] + if (len(remaining_vars)>0): cvars = [] ovars = [] - for v in model.vars: + for v in vars: # We don't iterate over remaining vars, because the ordering of the vars might be important somehow.. 
+ if (v in covered_vars): + continue if (isinstance(v.distribution, Categorical)): cvars.append(v) else: ovars.append(v) + # Make sure we sample the categorical ones first. Others might switch their regime accordingly .. if (len(cvars)>0): - proposals = [ CategoricalProposal(cvars), ProposalAdapter(ovars, NormalProposal) ] - else: - proposals = [ ProposalAdapter(ovars, NormalProposal) ] + proposals = [ CategoricalProposal(cvars)] + proposals + if (len(ovars)>0): + proposals.append(ProposalAdapter(ovars, NormalProposal)) self.proposals = proposals - var_set = set() + vars = self.vars for p in proposals: p.tune_stepsize(scaling) - for v in p.vars: - var_set.add(v) - vars = list(var_set) - + self.scaling = scaling self.tune = tune self.tune_interval = tune_interval @@ -165,13 +195,14 @@ def __init__(self, proposals=None, scaling=1., tune=True, tune_interval=100, mod self.accepted = 0 self.fastlogp = self.model.fastlogp self.oldlogp = None + self.acceptance_rate = [] def step(self, point): self._ensure_tuned() proposal = point.copy() proposal_symmetry = 0.0 for prop in self.proposals: - proposal = prop.propose_move(proposal) + proposal = prop.propose_move(point, proposal) proposal_symmetry += prop.proposal_logp_difference() @@ -186,11 +217,12 @@ def step(self, point): move_logp = proposal_symmetry + proposal_logp - self.oldlogp # Accept / Reject - if isfinite(move_logp) and log(uniform()) < move_logp: + if (move_logp==np.nan): + print "Warning: Invalid proposal in MetropolisHastings.step(..) (logp is not a number)" + if move_logp!=np.nan and log(uniform()) < move_logp: # Accept proposed value newpoint = proposal self.oldlogp = proposal_logp - self.accepted += 1 else: # Reject proposed value newpoint = point# @@ -202,8 +234,9 @@ def step(self, point): def _ensure_tuned(self): if self.tune and not self.steps_until_tune: # Tune scaling parameter - self.scaling = mh_tune( - self.scaling, self.accepted / float(self.tune_interval)) + acc_rate = self.accepted / float(self.tune_interval) + self.acceptance_rate.append(acc_rate) + self.scaling = mh_tune(self.scaling, acc_rate) for p in self.proposals: p.tune_stepsize(self.scaling) # Reset counter From bad7b046e2b3b234e98e70ae3972295ee047b179 Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Fri, 6 Jun 2014 15:29:19 +0000 Subject: [PATCH 10/14] Fixed case when arguments to bound_scalar have differing dtype --- pymc/distributions/continuous.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pymc/distributions/continuous.py b/pymc/distributions/continuous.py index 1c7c4c2537..55b6fec986 100644 --- a/pymc/distributions/continuous.py +++ b/pymc/distributions/continuous.py @@ -8,6 +8,7 @@ from __future__ import division from .dist_math import * +import numpy from numpy.random import uniform as runiform, normal as rnormal __all__ = ['Uniform', 'Flat', 'Normal', 'Beta', 'Exponential', 'Laplace', @@ -54,7 +55,7 @@ def logp(self, value): upper = self.upper return bound( - -log(upper - lower), + -log(upper - lower), # Inserted cast to prevent exception in theano.ifelse.ifelse lower <= value, value <= upper) def random(self, size=None): From 89124d428fa3075129947a98022b6d0a447f65c3 Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Fri, 6 Jun 2014 15:30:05 +0000 Subject: [PATCH 11/14] Small changes in GMM example --- pymc/examples/gmm_mixture_sampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc/examples/gmm_mixture_sampling.py b/pymc/examples/gmm_mixture_sampling.py index 168a076fc2..2f49aa5e30 100644 --- 
a/pymc/examples/gmm_mixture_sampling.py +++ b/pymc/examples/gmm_mixture_sampling.py @@ -114,7 +114,7 @@ def plot_p_prior_vs_posterior(prior_trace, posterior_trace, fig=None): plt.legend() plt.show() -def run(n=5000): +def run(n=10000): m1, step1 = create_model(False) m2, step2 = create_model(True) with m1: From 201d5e86d1c20e2aa94395b1c05dd896e26665be Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Sat, 14 Jun 2014 07:44:02 +0000 Subject: [PATCH 12/14] Fixed bug in Wihart distribution. Added Inverse Wishart distribution. Added references to papers describing how to use these for both. --- pymc/distributions/multivariate.py | 124 ++++++++++++++++++++++------- 1 file changed, 96 insertions(+), 28 deletions(-) diff --git a/pymc/distributions/multivariate.py b/pymc/distributions/multivariate.py index 945e39265c..3f209072b8 100644 --- a/pymc/distributions/multivariate.py +++ b/pymc/distributions/multivariate.py @@ -1,7 +1,8 @@ from .dist_math import * from theano.sandbox.linalg import det, solve, matrix_inverse, trace -from theano.tensor import dot, cast +from theano.tensor import dot, cast, gt +from theano.ifelse import ifelse from theano.printing import Print __all__ = ['MvNormal', 'Dirichlet', 'Multinomial', 'Wishart'] @@ -134,46 +135,113 @@ class Wishart(Continuous): distribution of the maximum-likelihood estimator (MLE) of the precision matrix of a multivariate normal distribution. If V=1, the distribution is identical to the chi-square distribution with n degrees of freedom. - - For an alternative parameterization based on :math:`C=T{-1}` (Not yet implemented) - - .. math:: - f(X \mid n, T) = \frac{{\mid T \mid}^{n/2}{\mid X \mid}^{(n-k-1)/2}}{2^{nk/2} - \Gamma_p(n/2)} \exp\left\{ -\frac{1}{2} Tr(TX) \right\} - - where :math:`k` is the rank of X. - + + It is also the conjugate Prior for the Precision Matrix parameter of a + multivariate normal. + + This follows the parameterization given in formula 290 and 291 on page 24 + of [Kevin Murphy, Conjugate Bayesian Analysis of the Gaussian Distribution] + available at http://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf + see that paper for further details and references. + + To create noninformative priors for covariance or precision matrices, see + Huang and Wang, "Simple Marginally Noninformative Prior Distributions + for Covariance Matrices" ( http://ba.stat.cmu.edu/journal/2013/vol08/issue02/huang.pdf ) + and Gelman, "Prior Distributions for variance parameters in hierarchical models" + ( https://faculty.washington.edu/jmiyamot/bayes/gelmana%20prior%20distributions%20f%20variance%20parameters%20i%20hierarchical%20mods.pdf ) + as well as http://www.themattsimpson.com/2012/08/20/prior-distributions-for-covariance-matrices-the-scaled-inverse-wishart-prior/ + and http://dahtah.wordpress.com/2012/03/07/why-an-inverse-wishart-prior-may-not-be-such-a-good-idea/ + for a discussion of possible problems with priors of covariance and / or precision matrices + :Parameters: - n : int - Degrees of freedom, > 0. + v : int + Degrees of freedom, v > p-1 . V : ndarray p x p positive definite matrix - :Support: X : matrix Symmetric, positive definite. 
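    For reference, the log-density implied by this parameterization (scale matrix S,
    degrees of freedom v, consistent with the mean v*S and mode (v-p-1)*S above) is

    .. math::
       \log p(X \mid v, S) = \frac{v-p-1}{2}\log|X| - \frac{1}{2}\operatorname{tr}(S^{-1}X)
           - \frac{vp}{2}\log 2 - \frac{v}{2}\log|S| - \log\Gamma_p\!\left(\frac{v}{2}\right)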
""" - def __init__(self, n, V, *args, **kwargs): + def __init__(self, v, S, *args, **kwargs): super(Wishart, self).__init__(*args, **kwargs) - self.n = n - self.p = p = V.shape[0] - self.V = V - self.mean = n * V - self.mode = switch(1*(n >= p + 1), - (n - p - 1) * V, + self.v = v + self.p = p = S.shape[0] + self.S = S + + self.mean = v * S + self.mode = switch(1*(v > p + 1), + (v - p - 1) * S, nan) + 'TODO: We should pre-compute the following if the parameters are fixed' + self.invalid = theano.tensor.fill(S, nan) # Invalid result, if v

(p - 1)) +class InverseWishart(Continuous): + """ + The Inverse Wishart distribution is the conjugate prior + for the covariance matrix of a multivariate normal. It is also + the distribution of the maximum-likelihood estimator (MLE) of the covariance + matrix of a multivariate normal distribution. + + This follows the parameterization given in formula 296 and 297 on page 25 + of [Kevin Murphy, Conjugate Bayesian Analysis of the Gaussian Distribution] + available at http://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf + see that paper for further details and references. + + To create noninformative priors for covariance or precision matrices, see + Huang and Wang, "Simple Marginally Noninformative Prior Distributions + for Covariance Matrices" ( http://ba.stat.cmu.edu/journal/2013/vol08/issue02/huang.pdf ) + and Gelman, "Prior Distributions for variance parameters in hierarchical models" + ( https://faculty.washington.edu/jmiyamot/bayes/gelmana%20prior%20distributions%20f%20variance%20parameters%20i%20hierarchical%20mods.pdf ) + as well as http://www.themattsimpson.com/2012/08/20/prior-distributions-for-covariance-matrices-the-scaled-inverse-wishart-prior/ + and http://dahtah.wordpress.com/2012/03/07/why-an-inverse-wishart-prior-may-not-be-such-a-good-idea/ + for a discussion of possible problems with priors of covariance and / or precision matrices + + :Parameters: + v : int + Degrees of freedom, v > p - 1 + inv_S : ndarray + p x p positive definite matrix (inverted scale matrix) + + + :Support: + X : matrix + Symmetric, positive definite. + """ + def __init__(self, v, inv_S, *args, **kwargs): + super(Wishart, self).__init__(*args, **kwargs) + self.v = v + self.p = p = inv_S.shape[0] + self.inv_S = inv_S + + 'TODO: We should pre-compute the following if the parameters are fixed' + S = matrix_inverse(inv_S) + self.S = S + self.invalid = theano.tensor.fill(inv_S, nan) # Invalid result, if v

Date: Sat, 14 Jun 2014 18:38:46 +0000 Subject: [PATCH 13/14] Added constructor for a special noninformative prior for covariance matrices --- pymc/distributions/multivariate.py | 32 ++++++++++++++++++++++++++- pymc/distributions/special_priors.py | 33 ++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 pymc/distributions/special_priors.py diff --git a/pymc/distributions/multivariate.py b/pymc/distributions/multivariate.py index 3f209072b8..d906243656 100644 --- a/pymc/distributions/multivariate.py +++ b/pymc/distributions/multivariate.py @@ -237,7 +237,6 @@ def __init__(self, v, inv_S, *args, **kwargs): self.Z = log(2.)*(v * p / 2.) + multigammaln(p, v / 2.) - log(det(S)) * v / 2., self.mean = ifelse(gt(v, p-1), S / ( v - p - 1), self.invalid) - def logp(self, X): v = self.v p = self.p @@ -245,3 +244,34 @@ def logp(self, X): Z = self.Z result = -Z + log(det(X)) * -(v + p + 1.) / 2. - trace(S.dot(matrix_inverse(X))) / 2. return ifelse(gt(v, p-1), result, self.invalid) + + +def noninformative_covariance_prior(name1, name2, d, model=None): + ''' + Construct a two part noninformative prior for the covariance matrix + following Huang and Wang, "Simple Marginally Noninformative Prior Distributions + for Covariance Matrices" ( http://ba.stat.cmu.edu/journal/2013/vol08/issue02/huang.pdf ) + + The resulting prior has an almost flat half-t distribution over the variables variances, + while providing an uniform-prior (range [-1,1]for the offdiagonal elements of the + correlation matrix corresponding to the covariance matrix. + + + + Arguments: + name1: Name for the Inverse Wishart distribution which will be created + name2: Name for the Inverse Gamma distribution which will be created as a prior for the diagonal elements of the inv_S param of the Inverse Wishart + d: Dimensionality, i.e. number of variables to create a joint covariance prior for. + model: (optional) the model + ''' + from sys import float_info + import numpy as np + from pymc.distributions.continuous import InverseGamma + from theano.sandbox.linalg.ops import diag, psd + A = float_info.max / 4. # Large number + d_ones = np.ones(d, dtype=np.float64) + a_hyperprior = InverseGamma(name=name2, d_ones/2., d_ones / A, model=model) + ' Note that the InverseWishart in this library is parameterized with the inverse of S, So we do not divide by a_hyperprior, but use it more directly.' + invert a diagonal matrix.' + cov_prior = InverseWishart(name=name1, d+1, 4. * a_hyperprior, model=model) + return cov_prior, a_hyperprior diff --git a/pymc/distributions/special_priors.py b/pymc/distributions/special_priors.py new file mode 100644 index 0000000000..3162a816c8 --- /dev/null +++ b/pymc/distributions/special_priors.py @@ -0,0 +1,33 @@ + +from .multivariate import InverseWishart +from .continuous import InverseGamma +from sys import float_info +import numpy as np +from pymc.distributions.continuous import InverseGamma +from theano.sandbox.linalg.ops import diag, psd + +def create_noninformative_covariance_prior(name1, name2, d, model=None): + ''' + Construct a two part noninformative prior for the covariance matrix + following Huang and Wang, "Simple Marginally Noninformative Prior Distributions + for Covariance Matrices" ( http://ba.stat.cmu.edu/journal/2013/vol08/issue02/huang.pdf ) + + The resulting prior has an almost flat half-t distribution over the variance of the variables, + while efficiently having an uniform-prior (range [-1,1] for the correlation coefficients. 
+ + Arguments: + name1: Name for the Inverse Wishart distribution which will be created + name2: Name for the Inverse Gamma distribution which will be created as a prior for the diagonal elements of the inv_S param of the Inverse Wishart + d: Dimensionality, i.e. number of variables to create a joint covariance prior for. + model: (optional) the model + Returns: + A tuple consisting of the covariance prior (an InverseWishart), and the hyperprior (InverseGamma) for the + diagonal elements of the InverseWishart inv_S parameter + ''' + + A = float_info.max / 4. # Large number + d_ones = np.ones(d, dtype=np.float64) + a_hyperprior = InverseGamma(name=name2, d_ones/2., d_ones / A, model=model) + ' Note that the InverseWishart in this library is parameterized with the inverse of S, So we do not divide by a_hyperprior, but use it more directly.' + cov_prior = InverseWishart(name=name1, d+1, diag(4. * a_hyperprior), model=model) + return cov_prior, a_hyperprior \ No newline at end of file From b44d10a8fc29db99e3347e27cf3729362fdef6df Mon Sep 17 00:00:00 2001 From: Kai Londenberg Date: Sat, 14 Jun 2014 20:48:26 +0000 Subject: [PATCH 14/14] Small changes to the InverseWishart and the special covariance prior --- pymc/distributions/multivariate.py | 51 +++++++--------------------- pymc/distributions/special_priors.py | 19 +++++++---- 2 files changed, 25 insertions(+), 45 deletions(-) diff --git a/pymc/distributions/multivariate.py b/pymc/distributions/multivariate.py index d906243656..76eda5d61f 100644 --- a/pymc/distributions/multivariate.py +++ b/pymc/distributions/multivariate.py @@ -156,8 +156,8 @@ class Wishart(Continuous): :Parameters: v : int Degrees of freedom, v > p-1 . - V : ndarray - p x p positive definite matrix + S : ndarray + p x p positive definite matrix (scale matrix / scatter matrix). Can be interpreted as the scatter matrix of v p-variate observations. :Support: X : matrix @@ -203,6 +203,8 @@ class InverseWishart(Continuous): of [Kevin Murphy, Conjugate Bayesian Analysis of the Gaussian Distribution] available at http://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf see that paper for further details and references. + + We parameterize with the scale matrix (scatter matrix) of possibly virtual observations, not with it's inverse. To create noninformative priors for covariance or precision matrices, see Huang and Wang, "Simple Marginally Noninformative Prior Distributions @@ -216,24 +218,23 @@ class InverseWishart(Continuous): :Parameters: v : int Degrees of freedom, v > p - 1 - inv_S : ndarray - p x p positive definite matrix (inverted scale matrix) + S : ndarray + p x p positive definite matrix (scale matrix / scatter matrix). Can be interpreted as the scatter matrix of v p-variate observations. :Support: X : matrix Symmetric, positive definite. """ - def __init__(self, v, inv_S, *args, **kwargs): + def __init__(self, v, S, *args, **kwargs): super(Wishart, self).__init__(*args, **kwargs) self.v = v - self.p = p = inv_S.shape[0] - self.inv_S = inv_S - - 'TODO: We should pre-compute the following if the parameters are fixed' - S = matrix_inverse(inv_S) self.S = S - self.invalid = theano.tensor.fill(inv_S, nan) # Invalid result, if v