
Commit a53e865

tomicapretto authored and ricardoV94 committed
Implement Hurdle distributions
1 parent 970db18 commit a53e865

4 files changed: +420 -1 lines changed


docs/source/api/distributions/mixture.rst

Lines changed: 4 additions & 0 deletions
@@ -11,3 +11,7 @@ Mixture
    ZeroInflatedBinomial
    ZeroInflatedNegativeBinomial
    ZeroInflatedPoisson
+   HurdlePoisson
+   HurdleNegativeBinomial
+   HurdleGamma
+   HurdleLogNormal

pymc/distributions/__init__.py

Lines changed: 8 additions & 0 deletions
@@ -74,6 +74,10 @@
     SymbolicRandomVariable,
 )
 from pymc.distributions.mixture import (
+    HurdleGamma,
+    HurdleLogNormal,
+    HurdleNegativeBinomial,
+    HurdlePoisson,
     Mixture,
     NormalMixture,
     ZeroInflatedBinomial,
@@ -195,4 +199,8 @@
     "Censored",
     "CAR",
     "PolyaGamma",
+    "HurdleGamma",
+    "HurdleLogNormal",
+    "HurdleNegativeBinomial",
+    "HurdlePoisson",
 ]
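
With these `__all__` additions, the hurdle classes become importable from the top-level `pymc` namespace, assuming the top-level package re-exports `pymc.distributions` as it does for the existing zero-inflated distributions. A minimal sketch of what that enables (illustrative only; the values are made up and not part of the commit):

import pymc as pm

# Unregistered hurdle-Poisson variable: `psi` is the probability of clearing
# the hurdle (producing a nonzero count), `mu` the rate of the truncated
# Poisson component.
y = pm.HurdlePoisson.dist(psi=0.7, mu=2.0)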

pymc/distributions/mixture.py

Lines changed: 244 additions & 1 deletion
@@ -22,7 +22,7 @@
 from pytensor.tensor.random.op import RandomVariable
 
 from pymc.distributions import transforms
-from pymc.distributions.continuous import Normal, get_tau_sigma
+from pymc.distributions.continuous import Gamma, LogNormal, Normal, get_tau_sigma
 from pymc.distributions.discrete import Binomial, NegativeBinomial, Poisson
 from pymc.distributions.dist_math import check_parameters
 from pymc.distributions.distribution import (
@@ -34,6 +34,7 @@
 )
 from pymc.distributions.shape_utils import _change_dist_size, change_dist_size
 from pymc.distributions.transforms import _default_transform
+from pymc.distributions.truncated import Truncated
 from pymc.logprob.abstract import _logcdf, _logcdf_helper, _logprob, _logprob_helper
 from pymc.logprob.transforms import IntervalTransform
 from pymc.logprob.utils import ignore_logprob
@@ -42,6 +43,10 @@
 from pymc.vartypes import continuous_types, discrete_types
 
 __all__ = [
+    "HurdleGamma",
+    "HurdleLogNormal",
+    "HurdleNegativeBinomial",
+    "HurdlePoisson",
     "Mixture",
     "NormalMixture",
     "ZeroInflatedBinomial",
@@ -794,3 +799,241 @@ def dist(cls, psi, mu=None, alpha=None, p=None, n=None, **kwargs):
             nonzero_dist=NegativeBinomial.dist(mu=mu, alpha=alpha, p=p, n=n),
             **kwargs,
         )
+
+
+def _hurdle_mixture(*, name, nonzero_p, nonzero_dist, dtype, **kwargs):
+    """Helper function to create hurdle mixtures.
+
+    If name is `None`, this function returns an unregistered variable.
+
+    In hurdle models, the zeros come from a completely different process than the rest of the data.
+    In other words, the zeros are not inflated; they come from a different process.
+    """
+    if dtype == "float":
+        zero = 0.0
+        lower = np.finfo(pytensor.config.floatX).eps
+    elif dtype == "int":
+        zero = 0
+        lower = 1
+    else:
+        raise ValueError("dtype must be 'float' or 'int'")
+
+    nonzero_p = pt.as_tensor_variable(floatX(nonzero_p))
+    weights = pt.stack([1 - nonzero_p, nonzero_p], axis=-1)
+    comp_dists = [
+        DiracDelta.dist(zero),
+        Truncated.dist(nonzero_dist, lower=lower),
+    ]
+
+    if name is not None:
+        return Mixture(name, weights, comp_dists, **kwargs)
+    else:
+        return Mixture.dist(weights, comp_dists, **kwargs)
+
+
+class HurdlePoisson:
+    R"""
+    Hurdle Poisson log-likelihood.
+
+    The Poisson distribution is often used to model the number of events occurring
+    in a fixed period of time or space when the times or locations
+    at which events occur are independent.
+
+    The difference with ZeroInflatedPoisson is that the zeros are not inflated;
+    they come from a completely independent process.
+
+    The pmf of this distribution is
+
+    .. math::
+
+        f(x \mid \psi, \mu) =
+            \left\{
+                \begin{array}{l}
+                (1 - \psi) \ \text{if } x = 0 \\
+                \psi
+                \frac{\text{PoissonPDF}(x \mid \mu)}
+                    {1 - \text{PoissonCDF}(0 \mid \mu)} \ \text{if } x = 1, 2, 3, \ldots
+                \end{array}
+            \right.
+
+    Parameters
+    ----------
+    psi : tensor_like of float
+        Expected proportion of Poisson variates (0 < psi < 1)
+    mu : tensor_like of float
+        Expected number of occurrences (mu >= 0).
+    """
+
+    def __new__(cls, name, psi, mu, **kwargs):
+        return _hurdle_mixture(
+            name=name, nonzero_p=psi, nonzero_dist=Poisson.dist(mu=mu), dtype="int", **kwargs
+        )
+
+    @classmethod
+    def dist(cls, psi, mu, **kwargs):
+        return _hurdle_mixture(
+            name=None, nonzero_p=psi, nonzero_dist=Poisson.dist(mu=mu), dtype="int", **kwargs
+        )
+
+
+class HurdleNegativeBinomial:
+    R"""
+    Hurdle Negative Binomial log-likelihood.
+
+    The negative binomial distribution describes a Poisson random variable
+    whose rate parameter is gamma distributed.
+
+    The difference with ZeroInflatedNegativeBinomial is that the zeros are not inflated;
+    they come from a completely independent process.
+
+    The pmf of this distribution is
+
+    .. math::
+
+        f(x \mid \psi, \mu, \alpha) =
+            \left\{
+                \begin{array}{l}
+                (1 - \psi) \ \text{if } x = 0 \\
+                \psi
+                \frac{\text{NegativeBinomialPDF}(x \mid \mu, \alpha)}
+                    {1 - \text{NegativeBinomialCDF}(0 \mid \mu, \alpha)} \ \text{if } x = 1, 2, 3, \ldots
+                \end{array}
+            \right.
+
+    Parameters
+    ----------
+    psi : tensor_like of float
+        Expected proportion of Negative Binomial variates (0 < psi < 1)
+    alpha : tensor_like of float
+        Gamma distribution shape parameter (alpha > 0).
+    mu : tensor_like of float
+        Gamma distribution mean (mu > 0).
+    p : tensor_like of float
+        Alternative probability of success in each trial (0 < p < 1).
+    n : tensor_like of float
+        Alternative number of target success trials (n > 0).
+    """
+
+    def __new__(cls, name, psi, mu=None, alpha=None, p=None, n=None, **kwargs):
+        return _hurdle_mixture(
+            name=name,
+            nonzero_p=psi,
+            nonzero_dist=NegativeBinomial.dist(mu=mu, alpha=alpha, p=p, n=n),
+            dtype="int",
+            **kwargs,
+        )
+
+    @classmethod
+    def dist(cls, psi, mu=None, alpha=None, p=None, n=None, **kwargs):
+        return _hurdle_mixture(
+            name=None,
+            nonzero_p=psi,
+            nonzero_dist=NegativeBinomial.dist(mu=mu, alpha=alpha, p=p, n=n),
+            dtype="int",
+            **kwargs,
+        )
+
+
+class HurdleGamma:
+    R"""
+    Hurdle Gamma log-likelihood.
+
+    .. math::
+
+        f(x \mid \psi, \alpha, \beta) =
+            \left\{
+                \begin{array}{l}
+                (1 - \psi) \ \text{if } x = 0 \\
+                \psi
+                \frac{\text{GammaPDF}(x \mid \alpha, \beta)}
+                    {1 - \text{GammaCDF}(\epsilon \mid \alpha, \beta)} \ \text{if } x > 0
+                \end{array}
+            \right.
+
+    where :math:`\epsilon` is the machine precision.
+
+    Parameters
+    ----------
+    psi : tensor_like of float
+        Expected proportion of Gamma variates (0 < psi < 1)
+    alpha : tensor_like of float, optional
+        Shape parameter (alpha > 0).
+    beta : tensor_like of float, optional
+        Rate parameter (beta > 0).
+    mu : tensor_like of float, optional
+        Alternative shape parameter (mu > 0).
+    sigma : tensor_like of float, optional
+        Alternative scale parameter (sigma > 0).
+    """
+
+    def __new__(cls, name, psi, alpha=None, beta=None, mu=None, sigma=None, **kwargs):
+        return _hurdle_mixture(
+            name=name,
+            nonzero_p=psi,
+            nonzero_dist=Gamma.dist(alpha=alpha, beta=beta, mu=mu, sigma=sigma),
+            dtype="float",
+            **kwargs,
+        )
+
+    @classmethod
+    def dist(cls, psi, alpha=None, beta=None, mu=None, sigma=None, **kwargs):
+        return _hurdle_mixture(
+            name=None,
+            nonzero_p=psi,
+            nonzero_dist=Gamma.dist(alpha=alpha, beta=beta, mu=mu, sigma=sigma),
+            dtype="float",
+            **kwargs,
+        )
+
+
+class HurdleLogNormal:
+    R"""
+    Hurdle LogNormal log-likelihood.
+
+    .. math::
+
+        f(x \mid \psi, \mu, \sigma) =
+            \left\{
+                \begin{array}{l}
+                (1 - \psi) \ \text{if } x = 0 \\
+                \psi
+                \frac{\text{LogNormalPDF}(x \mid \mu, \sigma)}
+                    {1 - \text{LogNormalCDF}(\epsilon \mid \mu, \sigma)} \ \text{if } x > 0
+                \end{array}
+            \right.
+
+    where :math:`\epsilon` is the machine precision.
+
+    Parameters
+    ----------
+    psi : tensor_like of float
+        Expected proportion of LogNormal variates (0 < psi < 1)
+    mu : tensor_like of float, default 0
+        Location parameter.
+    sigma : tensor_like of float, optional
+        Standard deviation (sigma > 0). Only required if tau is not specified.
+        Defaults to 1.
+    tau : tensor_like of float, optional
+        Scale parameter (tau > 0). Only required if sigma is not specified.
+        Defaults to 1.
+    """
+
+    def __new__(cls, name, psi, mu=0, sigma=None, tau=None, **kwargs):
+        return _hurdle_mixture(
+            name=name,
+            nonzero_p=psi,
+            nonzero_dist=LogNormal.dist(mu=mu, sigma=sigma, tau=tau),
+            dtype="float",
+            **kwargs,
+        )
+
+    @classmethod
+    def dist(cls, psi, mu=0, sigma=None, tau=None, **kwargs):
+        return _hurdle_mixture(
+            name=None,
+            nonzero_p=psi,
+            nonzero_dist=LogNormal.dist(mu=mu, sigma=sigma, tau=tau),
+            dtype="float",
+            **kwargs,
+        )
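
As a usage sketch (not part of the commit; the data, priors, and variable names below are hypothetical), the new classes plug into a model like any other PyMC distribution, and the pmf given in the new `HurdlePoisson` docstring can be checked numerically with SciPy:

# Illustrative sketch only; values, priors, and data are made up.
import numpy as np
import pymc as pm
from scipy import stats

# Hurdle-Poisson pmf from the new docstring:
#   P(X = 0)         = 1 - psi
#   P(X = k), k >= 1 = psi * PoissonPDF(k | mu) / (1 - PoissonCDF(0 | mu))
psi, mu = 0.7, 2.0
k = np.arange(0, 10)
pmf = np.where(
    k == 0,
    1 - psi,
    psi * stats.poisson.pmf(k, mu) / (1 - stats.poisson.cdf(0, mu)),
)
# The mass over k = 0..9 should be close to 1 (only a tiny truncated tail remains).
assert np.isclose(pmf.sum(), 1.0, atol=1e-3)

# Fit the new distribution as an observed likelihood in a model.
counts = np.random.default_rng(0).choice(k, size=100, p=pmf / pmf.sum())
with pm.Model():
    psi_rv = pm.Beta("psi", 1, 1)       # probability of a nonzero count
    mu_rv = pm.Exponential("mu", 1.0)   # rate of the truncated Poisson part
    pm.HurdlePoisson("y", psi=psi_rv, mu=mu_rv, observed=counts)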
