Skip to content

Commit 9dbf9ea

Browse files
Add numerically safe ordered probit distribution. (#4232)
* Add numerically safer ordered probit distribution. * Fix styling issue. * Use smaller values for probit & cutpoint testing * fix import sorting * Typo fix * add a line to RELEASE-NOTES.md Co-authored-by: Michael Osthege <[email protected]> Co-authored-by: Michael Osthege <[email protected]>
1 parent dbcc49e commit 9dbf9ea

File tree

5 files changed

+197
-1
lines changed

5 files changed

+197
-1
lines changed

RELEASE-NOTES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ This is the first release to support Python3.9 and to drop Python3.6.
99
- Make `sample_shape` same across all contexts in `draw_values` (see [#4305](https://github.com/pymc-devs/pymc3/pull/4305)).
1010
- Removed `theanof.set_theano_config` because it illegally touched Theano's privates (see [#4329](https://github.com/pymc-devs/pymc3/pull/4329)).
1111

12+
### New Features
13+
- `OrderedProbit` distribution added (see [#4232](https://github.com/pymc-devs/pymc3/pull/4232)).
1214

1315
## PyMC3 3.10.0 (7 December 2020)
1416

pymc3/distributions/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
HyperGeometric,
6262
NegativeBinomial,
6363
OrderedLogistic,
64+
OrderedProbit,
6465
Poisson,
6566
ZeroInflatedBinomial,
6667
ZeroInflatedNegativeBinomial,
@@ -140,6 +141,7 @@
140141
"HyperGeometric",
141142
"Categorical",
142143
"OrderedLogistic",
144+
"OrderedProbit",
143145
"DensityDist",
144146
"Distribution",
145147
"Continuous",

pymc3/distributions/discrete.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,10 @@
2424
binomln,
2525
bound,
2626
factln,
27+
log_diff_normal_cdf,
2728
logpow,
29+
normal_lccdf,
30+
normal_lcdf,
2831
random_choice,
2932
)
3033
from pymc3.distributions.distribution import Discrete, draw_values, generate_samples
@@ -1638,3 +1641,131 @@ def __init__(self, eta, cutpoints, *args, **kwargs):
16381641
p = p_cum[..., 1:] - p_cum[..., :-1]
16391642

16401643
super().__init__(p=p, *args, **kwargs)
1644+
1645+
1646+
class OrderedProbit(Categorical):
1647+
R"""
1648+
Ordered Probit log-likelihood.
1649+
1650+
Useful for regression on ordinal data values whose values range
1651+
from 1 to K as a function of some predictor, :math:`\eta`. The
1652+
cutpoints, :math:`c`, separate which ranges of :math:`\eta` are
1653+
mapped to which of the K observed dependent variables. The number
1654+
of cutpoints is K - 1. It is recommended that the cutpoints are
1655+
constrained to be ordered.
1656+
1657+
In order to stabilize the computation, log-likelihood is computed
1658+
in log space using the scaled error function `erfcx`.
1659+
1660+
.. math::
1661+
1662+
f(k \mid \eta, c) = \left\{
1663+
\begin{array}{l}
1664+
1 - \text{normal_cdf}(0, \sigma, \eta - c_1)
1665+
\,, \text{if } k = 0 \\
1666+
\text{normal_cdf}(0, \sigma, \eta - c_{k - 1}) -
1667+
\text{normal_cdf}(0, \sigma, \eta - c_{k})
1668+
\,, \text{if } 0 < k < K \\
1669+
\text{normal_cdf}(0, \sigma, \eta - c_{K - 1})
1670+
\,, \text{if } k = K \\
1671+
\end{array}
1672+
\right.
1673+
1674+
Parameters
1675+
----------
1676+
eta : float
1677+
The predictor.
1678+
c : array
1679+
The length K - 1 array of cutpoints which break :math:`\eta` into
1680+
ranges. Do not explicitly set the first and last elements of
1681+
:math:`c` to negative and positive infinity.
1682+
1683+
sigma: float
1684+
The standard deviation of probit function.
1685+
Example
1686+
--------
1687+
.. code:: python
1688+
1689+
# Generate data for a simple 1 dimensional example problem
1690+
n1_c = 300; n2_c = 300; n3_c = 300
1691+
cluster1 = np.random.randn(n1_c) + -1
1692+
cluster2 = np.random.randn(n2_c) + 0
1693+
cluster3 = np.random.randn(n3_c) + 2
1694+
1695+
x = np.concatenate((cluster1, cluster2, cluster3))
1696+
y = np.concatenate((1*np.ones(n1_c),
1697+
2*np.ones(n2_c),
1698+
3*np.ones(n3_c))) - 1
1699+
1700+
# Ordered probit regression
1701+
with pm.Model() as model:
1702+
cutpoints = pm.Normal("cutpoints", mu=[-1,1], sigma=10, shape=2,
1703+
transform=pm.distributions.transforms.ordered)
1704+
y_ = pm.OrderedProbit("y", cutpoints=cutpoints, eta=x, observed=y)
1705+
tr = pm.sample(1000)
1706+
1707+
# Plot the results
1708+
plt.hist(cluster1, 30, alpha=0.5);
1709+
plt.hist(cluster2, 30, alpha=0.5);
1710+
plt.hist(cluster3, 30, alpha=0.5);
1711+
plt.hist(tr["cutpoints"][:,0], 80, alpha=0.2, color='k');
1712+
plt.hist(tr["cutpoints"][:,1], 80, alpha=0.2, color='k');
1713+
1714+
"""
1715+
1716+
def __init__(self, eta, cutpoints, *args, **kwargs):
1717+
1718+
self.eta = tt.as_tensor_variable(floatX(eta))
1719+
self.cutpoints = tt.as_tensor_variable(cutpoints)
1720+
1721+
probits = tt.shape_padright(self.eta) - self.cutpoints
1722+
_log_p = tt.concatenate(
1723+
[
1724+
tt.shape_padright(normal_lccdf(0, 1, probits[..., 0])),
1725+
log_diff_normal_cdf(0, 1, probits[..., :-1], probits[..., 1:]),
1726+
tt.shape_padright(normal_lcdf(0, 1, probits[..., -1])),
1727+
],
1728+
axis=-1,
1729+
)
1730+
_log_p = tt.as_tensor_variable(floatX(_log_p))
1731+
1732+
self._log_p = _log_p
1733+
self.mode = tt.argmax(_log_p, axis=-1)
1734+
p = tt.exp(_log_p)
1735+
1736+
super().__init__(p=p, *args, **kwargs)
1737+
1738+
def logp(self, value):
1739+
r"""
1740+
Calculate log-probability of Ordered Probit distribution at specified value.
1741+
1742+
Parameters
1743+
----------
1744+
value: numeric
1745+
Value(s) for which log-probability is calculated. If the log probabilities for multiple
1746+
values are desired the values must be provided in a numpy array or theano tensor
1747+
1748+
Returns
1749+
-------
1750+
TensorVariable
1751+
"""
1752+
logp = self._log_p
1753+
k = self.k
1754+
1755+
# Clip values before using them for indexing
1756+
value_clip = tt.clip(value, 0, k - 1)
1757+
1758+
if logp.ndim > 1:
1759+
if logp.ndim > value_clip.ndim:
1760+
value_clip = tt.shape_padleft(value_clip, logp.ndim - value_clip.ndim)
1761+
elif logp.ndim < value_clip.ndim:
1762+
logp = tt.shape_padleft(logp, value_clip.ndim - logp.ndim)
1763+
pattern = (logp.ndim - 1,) + tuple(range(logp.ndim - 1))
1764+
a = take_along_axis(
1765+
logp.dimshuffle(pattern),
1766+
value_clip,
1767+
)
1768+
else:
1769+
a = logp[value_clip]
1770+
1771+
return bound(a, value >= 0, value <= (k - 1))

pymc3/distributions/dist_math.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,46 @@ def normal_lccdf(mu, sigma, x):
134134
)
135135

136136

137+
def log_diff_normal_cdf(mu, sigma, x, y):
138+
"""
139+
Compute :math:`\\log(\\Phi(\frac{x - \\mu}{\\sigma}) - \\Phi(\frac{y - \\mu}{\\sigma}))` safely in log space.
140+
141+
Parameters
142+
----------
143+
mu: float
144+
mean
145+
sigma: float
146+
std
147+
148+
x: float
149+
150+
y: float
151+
must be strictly less than x.
152+
153+
Returns
154+
-------
155+
log (\\Phi(x) - \\Phi(y))
156+
157+
"""
158+
x = (x - mu) / sigma / tt.sqrt(2.0)
159+
y = (y - mu) / sigma / tt.sqrt(2.0)
160+
161+
# To stabilize the computation, consider these three regions:
162+
# 1) x > y > 0 => Use erf(x) = 1 - e^{-x^2} erfcx(x) and erf(y) =1 - e^{-y^2} erfcx(y)
163+
# 2) 0 > x > y => Use erf(x) = e^{-x^2} erfcx(-x) and erf(y) = e^{-y^2} erfcx(-y)
164+
# 3) x > 0 > y => Naive formula log( (erf(x) - erf(y)) / 2 ) works fine.
165+
return tt.log(0.5) + tt.switch(
166+
tt.gt(y, 0),
167+
-tt.square(y) + tt.log(tt.erfcx(y) - tt.exp(tt.square(y) - tt.square(x)) * tt.erfcx(x)),
168+
tt.switch(
169+
tt.lt(x, 0), # 0 > x > y
170+
-tt.square(x)
171+
+ tt.log(tt.erfcx(-x) - tt.exp(tt.square(x) - tt.square(y)) * tt.erfcx(-y)),
172+
tt.log(tt.erf(x) - tt.erf(y)), # x >0 > y
173+
),
174+
)
175+
176+
137177
def sigma2rho(sigma):
138178
"""
139179
`sigma -> rho` theano converter

pymc3/tests/test_distributions.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from numpy import array, exp, inf, log
2727
from numpy.testing import assert_allclose, assert_almost_equal, assert_equal
2828
from scipy import integrate
29-
from scipy.special import logit
29+
from scipy.special import erf, logit
3030

3131
import pymc3 as pm
3232

@@ -74,6 +74,7 @@
7474
NegativeBinomial,
7575
Normal,
7676
OrderedLogistic,
77+
OrderedProbit,
7778
Pareto,
7879
Poisson,
7980
Rice,
@@ -431,6 +432,17 @@ def orderedlogistic_logpdf(value, eta, cutpoints):
431432
return np.where(np.all(ps >= 0), np.log(p), -np.inf)
432433

433434

435+
def invprobit(x):
436+
return (erf(x / np.sqrt(2)) + 1) / 2
437+
438+
439+
def orderedprobit_logpdf(value, eta, cutpoints):
440+
c = np.concatenate(([-np.inf], cutpoints, [np.inf]))
441+
ps = np.array([invprobit(eta - cc) - invprobit(eta - cc1) for cc, cc1 in zip(c[:-1], c[1:])])
442+
p = ps[value]
443+
return np.where(np.all(ps >= 0), np.log(p), -np.inf)
444+
445+
434446
class Simplex:
435447
def __init__(self, n):
436448
self.vals = list(simplex_values(n))
@@ -1536,6 +1548,15 @@ def test_orderedlogistic(self, n):
15361548
lambda value, eta, cutpoints: orderedlogistic_logpdf(value, eta, cutpoints),
15371549
)
15381550

1551+
@pytest.mark.parametrize("n", [2, 3, 4])
1552+
def test_orderedprobit(self, n):
1553+
self.pymc3_matches_scipy(
1554+
OrderedProbit,
1555+
Domain(range(n), "int64"),
1556+
{"eta": Runif, "cutpoints": UnitSortedVector(n - 1)},
1557+
lambda value, eta, cutpoints: orderedprobit_logpdf(value, eta, cutpoints),
1558+
)
1559+
15391560
def test_densitydist(self):
15401561
def logp(x):
15411562
return -log(2 * 0.5) - abs(x - 0.5) / 0.5

0 commit comments

Comments
 (0)