Skip to content

Commit ebbba05

Browse files
committed
[Reliability Bayesian pymc-devs#474] fixed empirical coverage estimate
Signed-off-by: Nathaniel <[email protected]>
1 parent 2f9f40a commit ebbba05

File tree

3 files changed

+37
-45
lines changed

3 files changed

+37
-45
lines changed

examples/case_studies/reliability_and_calibrated_prediction.ipynb

+18-22
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
"import pandas as pd\n",
3434
"import pymc as pm\n",
3535
"\n",
36-
"from joblib import Parallel, delayed\n",
3736
"from lifelines import KaplanMeierFitter, LogNormalFitter, WeibullFitter\n",
3837
"from lifelines.utils import survival_table_from_events\n",
3938
"from scipy.stats import binom, lognorm, norm, weibull_min"
@@ -114,15 +113,13 @@
114113
}
115114
],
116115
"source": [
117-
"from scipy.stats import lognorm\n",
118-
"\n",
119116
"mu, sigma = 6, 0.3\n",
120117
"\n",
121118
"\n",
122119
"def plot_ln_pi(mu, sigma, xy=(700, 75), title=\"Exact Prediction Interval for Known Lognormal\"):\n",
123120
" failure_dist = lognorm(s=sigma, scale=np.exp(mu))\n",
124121
" samples = failure_dist.rvs(size=1000, random_state=100)\n",
125-
" fig, axs = plt.subplots(1, 3, figsize=(20, 10))\n",
122+
" fig, axs = plt.subplots(1, 3, figsize=(20, 8))\n",
126123
" axs = axs.flatten()\n",
127124
" axs[0].hist(samples, ec=\"black\", color=\"slateblue\", bins=30)\n",
128125
" axs[0].set_title(f\"Failure Time Distribution: LN({mu}, {sigma})\")\n",
@@ -1970,15 +1967,15 @@
19701967
"def bayes_boot(df, lb, ub, seed=100):\n",
19711968
" w = np.random.dirichlet(np.ones(len(df)), 1)[0]\n",
19721969
" lnf = LogNormalFitter().fit(df[\"t\"] + 1e-25, df[\"failed\"], weights=w)\n",
1973-
" ## Sample random choice from 95% percentile interval of bootstrapped dist\n",
1974-
" # choices = draws['t'].values\n",
1975-
" choices = np.linspace(df[\"t\"].min(), df[\"t\"].max(), 1000)\n",
1970+
" rv = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_))\n",
1971+
" ## Sample random choice from implied bootstrapped distribution\n",
1972+
" choices = rv.rvs(1000)\n",
19761973
" future = random.choice(choices)\n",
19771974
" ## Check if choice is contained within the MLE 95% PI\n",
19781975
" contained = (future >= lb) & (future <= ub)\n",
19791976
" ## Record 95% interval of bootstrapped dist\n",
1980-
" lb = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_)).ppf(0.025)\n",
1981-
" ub = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_)).ppf(0.975)\n",
1977+
" lb = rv.ppf(0.025)\n",
1978+
" ub = rv.ppf(0.975)\n",
19821979
" return lb, ub, contained, future, lnf.sigma_, lnf.mu_"
19831980
]
19841981
},
@@ -2162,16 +2159,15 @@
21622159
" draws.sort_values(\"t\", inplace=True)\n",
21632160
" ## Fit Lognormal Dist to\n",
21642161
" lnf = LogNormalFitter().fit(draws[\"t\"] + 1e-25, draws[\"failed\"])\n",
2165-
" ## Sample random choice from 95% percentile interval of bootstrapped dist\n",
2166-
" # choices = draws['t'].values\n",
2167-
" ## Essentially sampling from a uniform interval\n",
2168-
" choices = np.linspace(draws[\"t\"].min(), draws[\"t\"].max(), 1000)\n",
2162+
" rv = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_))\n",
2163+
" ## Sample random choice from implied distribution\n",
2164+
" choices = rv.rvs(1000)\n",
21692165
" future = random.choice(choices)\n",
21702166
" ## Check if choice is contained within the MLE 95% PI\n",
21712167
" contained = (future >= lb) & (future <= ub)\n",
21722168
" ## Record 95% interval of bootstrapped dist\n",
2173-
" lb = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_)).ppf(0.025)\n",
2174-
" ub = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_)).ppf(0.975)\n",
2169+
" lb = rv.ppf(0.025)\n",
2170+
" ub = rv.ppf(0.975)\n",
21752171
" return lb, ub, contained, future, lnf.sigma_, lnf.mu_\n",
21762172
"\n",
21772173
"\n",
@@ -2186,7 +2182,7 @@
21862182
"cell_type": "markdown",
21872183
"metadata": {},
21882184
"source": [
2189-
"We can use these bootstrapped statistics to further calculate quantities of the predictive distribution."
2185+
"We can use these bootstrapped statistics to further calculate quantities of the predictive distribution. In our case we could use the parametric CDF for our simple parametric model, but we'll adopt the empirical CDF here to show how this technique can be used when we have more complicated models too."
21902186
]
21912187
},
21922188
{
@@ -2234,7 +2230,7 @@
22342230
"for i in range(1000):\n",
22352231
" samples = lognorm(s=draws.iloc[i][\"Sigma\"], scale=np.exp(draws.iloc[i][\"Mu\"])).rvs(1000)\n",
22362232
" qe, pe = ecdf(samples)\n",
2237-
" ax.plot(qe, pe, color=\"grey\", alpha=0.2)\n",
2233+
" ax.plot(qe, pe, color=\"skyblue\", alpha=0.2)\n",
22382234
" lkup = dict(zip(pe, qe))\n",
22392235
" hist_data.append([lkup[0.05]])\n",
22402236
"hist_data = pd.DataFrame(hist_data, columns=[\"p05\"])\n",
@@ -2246,10 +2242,10 @@
22462242
"ax1.hist(hist_data[\"p05\"], color=\"slateblue\", ec=\"black\", alpha=0.4, bins=30)\n",
22472243
"ax1.set_title(\"Estimate of Uncertainty in the 5% Failure Time\", fontsize=20)\n",
22482244
"ax1.axvline(\n",
2249-
" hist_data[\"p05\"].quantile(0.025), color=\"cyan\", label=\"Lower Bound PI for 5% failure time\"\n",
2245+
" hist_data[\"p05\"].quantile(0.025), color=\"cyan\", label=\"Lower Bound CI for 5% failure time\"\n",
22502246
")\n",
22512247
"ax1.axvline(\n",
2252-
" hist_data[\"p05\"].quantile(0.975), color=\"cyan\", label=\"Upper Bound PI for 5% failure time\"\n",
2248+
" hist_data[\"p05\"].quantile(0.975), color=\"cyan\", label=\"Upper Bound CI for 5% failure time\"\n",
22532249
")\n",
22542250
"ax1.legend()\n",
22552251
"ax.legend();"
@@ -2365,7 +2361,7 @@
23652361
"cell_type": "markdown",
23662362
"metadata": {},
23672363
"source": [
2368-
"These simulations should be repeated a far larger number of times than we do here. We can also vary the interval size to achieve the desired coverage level."
2364+
"These simulations should be repeated a far larger number of times than we do here. It should be clear to see how we can also vary the MLE interval size to achieve the desired coverage level."
23692365
]
23702366
},
23712367
{
@@ -7333,12 +7329,12 @@
73337329
"hist_data_info = pd.DataFrame(hist_data_info, columns=[\"p10\", \"p05\"])\n",
73347330
"draws = pm.draw(pm.Weibull.dist(alpha=np.mean(alphas), beta=np.mean(betas)), 1000)\n",
73357331
"qe, pe = ecdf(draws)\n",
7336-
"ax.plot(qe, pe, color=\"purple\", label=\"Expected CDF Uninformative\")\n",
7332+
"ax.plot(qe, pe, color=\"purple\", label=\"Expected CDF Uninformative Prior\")\n",
73377333
"draws = pm.draw(\n",
73387334
" pm.Weibull.dist(alpha=np.mean(alphas_informative), beta=np.mean(betas_informative)), 1000\n",
73397335
")\n",
73407336
"qe, pe = ecdf(draws)\n",
7341-
"ax.plot(qe, pe, color=\"magenta\", label=\"Expected CDF Informative\")\n",
7337+
"ax.plot(qe, pe, color=\"magenta\", label=\"Expected CDF Informative Prior\")\n",
73427338
"ax.plot(\n",
73437339
" actuarial_table_bearings[\"t\"],\n",
73447340
" actuarial_table_bearings[\"logit_CI_95_ub\"],\n",

examples/references.bib

+1-1
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,7 @@ @book{mcelreath2018statistical
431431
publisher = {Chapman and Hall/CRC}
432432
}
433433
@book{Meeker2021,
434-
author = {Meeker, William},
434+
author = {Escobar, L.A. and Meeker, W.Q. and Pascual, F.G.},
435435
publisher = {Wiley},
436436
title = {Statistical Methods for Reliability Data},
437437
year = {2021}

myst_nbs/case_studies/reliability_and_calibrated_prediction.myst.md

+18-22
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ import numpy as np
3434
import pandas as pd
3535
import pymc as pm
3636
37-
from joblib import Parallel, delayed
3837
from lifelines import KaplanMeierFitter, LogNormalFitter, WeibullFitter
3938
from lifelines.utils import survival_table_from_events
4039
from scipy.stats import binom, lognorm, norm, weibull_min
@@ -81,15 +80,13 @@ Throughout the focus will be how the understanding of the CDF can help us unders
8180
In the study of reliability statistics there is a focus on location-scale based distributions with long tails. In an ideal world we'd know exactly which distribution described our failure process and the prediction interval for the next failure could be defined exactly.
8281

8382
```{code-cell} ipython3
84-
from scipy.stats import lognorm
85-
8683
mu, sigma = 6, 0.3
8784
8885
8986
def plot_ln_pi(mu, sigma, xy=(700, 75), title="Exact Prediction Interval for Known Lognormal"):
9087
failure_dist = lognorm(s=sigma, scale=np.exp(mu))
9188
samples = failure_dist.rvs(size=1000, random_state=100)
92-
fig, axs = plt.subplots(1, 3, figsize=(20, 10))
89+
fig, axs = plt.subplots(1, 3, figsize=(20, 8))
9390
axs = axs.flatten()
9491
axs[0].hist(samples, ec="black", color="slateblue", bins=30)
9592
axs[0].set_title(f"Failure Time Distribution: LN({mu}, {sigma})")
@@ -422,15 +419,15 @@ The second method we'll use to assess coverage is to bootstrap estimates of a 95
422419
def bayes_boot(df, lb, ub, seed=100):
423420
w = np.random.dirichlet(np.ones(len(df)), 1)[0]
424421
lnf = LogNormalFitter().fit(df["t"] + 1e-25, df["failed"], weights=w)
425-
## Sample random choice from 95% percentile interval of bootstrapped dist
426-
# choices = draws['t'].values
427-
choices = np.linspace(df["t"].min(), df["t"].max(), 1000)
422+
rv = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_))
423+
## Sample random choice from implied bootstrapped distribution
424+
choices = rv.rvs(1000)
428425
future = random.choice(choices)
429426
## Check if choice is contained within the MLE 95% PI
430427
contained = (future >= lb) & (future <= ub)
431428
## Record 95% interval of bootstrapped dist
432-
lb = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_)).ppf(0.025)
433-
ub = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_)).ppf(0.975)
429+
lb = rv.ppf(0.025)
430+
ub = rv.ppf(0.975)
434431
return lb, ub, contained, future, lnf.sigma_, lnf.mu_
435432
```
436433

@@ -440,16 +437,15 @@ def bootstrap(df, lb, ub, seed=100):
440437
draws.sort_values("t", inplace=True)
441438
## Fit Lognormal Dist to
442439
lnf = LogNormalFitter().fit(draws["t"] + 1e-25, draws["failed"])
443-
## Sample random choice from 95% percentile interval of bootstrapped dist
444-
# choices = draws['t'].values
445-
## Essentially sampling from a uniform interval
446-
choices = np.linspace(draws["t"].min(), draws["t"].max(), 1000)
440+
rv = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_))
441+
## Sample random choice from implied distribution
442+
choices = rv.rvs(1000)
447443
future = random.choice(choices)
448444
## Check if choice is contained within the MLE 95% PI
449445
contained = (future >= lb) & (future <= ub)
450446
## Record 95% interval of bootstrapped dist
451-
lb = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_)).ppf(0.025)
452-
ub = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_)).ppf(0.975)
447+
lb = rv.ppf(0.025)
448+
ub = rv.ppf(0.975)
453449
return lb, ub, contained, future, lnf.sigma_, lnf.mu_
454450
455451
@@ -460,7 +456,7 @@ draws = pd.DataFrame(
460456
draws
461457
```
462458

463-
We can use these bootstrapped statistics to further calculate quantities of the predictive distribution.
459+
We can use these bootstrapped statistics to further calculate quantities of the predictive distribution. In our case we could use the parametric CDF for our simple parametric model, but we'll adopt the empirical CDF here to show how this technique can be used when we have more complicated models too.
464460

465461
```{code-cell} ipython3
466462
def ecdf(sample):
@@ -486,7 +482,7 @@ hist_data = []
486482
for i in range(1000):
487483
samples = lognorm(s=draws.iloc[i]["Sigma"], scale=np.exp(draws.iloc[i]["Mu"])).rvs(1000)
488484
qe, pe = ecdf(samples)
489-
ax.plot(qe, pe, color="grey", alpha=0.2)
485+
ax.plot(qe, pe, color="skyblue", alpha=0.2)
490486
lkup = dict(zip(pe, qe))
491487
hist_data.append([lkup[0.05]])
492488
hist_data = pd.DataFrame(hist_data, columns=["p05"])
@@ -498,10 +494,10 @@ ax.set_title("Bootstrapped CDF functions for the Shock Absorbers Data", fontsize
498494
ax1.hist(hist_data["p05"], color="slateblue", ec="black", alpha=0.4, bins=30)
499495
ax1.set_title("Estimate of Uncertainty in the 5% Failure Time", fontsize=20)
500496
ax1.axvline(
501-
hist_data["p05"].quantile(0.025), color="cyan", label="Lower Bound PI for 5% failure time"
497+
hist_data["p05"].quantile(0.025), color="cyan", label="Lower Bound CI for 5% failure time"
502498
)
503499
ax1.axvline(
504-
hist_data["p05"].quantile(0.975), color="cyan", label="Upper Bound PI for 5% failure time"
500+
hist_data["p05"].quantile(0.975), color="cyan", label="Upper Bound CI for 5% failure time"
505501
)
506502
ax1.legend()
507503
ax.legend();
@@ -587,7 +583,7 @@ axs[2].annotate(
587583
);
588584
```
589585

590-
These simulations should be repeated a far larger number of times than we do here. We can also vary the interval size to achieve the desired coverage level.
586+
These simulations should be repeated a far larger number of times than we do here. It should be clear to see how we can also vary the MLE interval size to achieve the desired coverage level.
591587

592588
+++
593589

@@ -962,12 +958,12 @@ hist_data = pd.DataFrame(hist_data, columns=["p10", "p05"])
962958
hist_data_info = pd.DataFrame(hist_data_info, columns=["p10", "p05"])
963959
draws = pm.draw(pm.Weibull.dist(alpha=np.mean(alphas), beta=np.mean(betas)), 1000)
964960
qe, pe = ecdf(draws)
965-
ax.plot(qe, pe, color="purple", label="Expected CDF Uninformative")
961+
ax.plot(qe, pe, color="purple", label="Expected CDF Uninformative Prior")
966962
draws = pm.draw(
967963
pm.Weibull.dist(alpha=np.mean(alphas_informative), beta=np.mean(betas_informative)), 1000
968964
)
969965
qe, pe = ecdf(draws)
970-
ax.plot(qe, pe, color="magenta", label="Expected CDF Informative")
966+
ax.plot(qe, pe, color="magenta", label="Expected CDF Informative Prior")
971967
ax.plot(
972968
actuarial_table_bearings["t"],
973969
actuarial_table_bearings["logit_CI_95_ub"],

0 commit comments

Comments
 (0)