Skip to content

Commit a03ca61

Browse files
committed
update bayesian predictions to use einstats
1 parent d1854ac commit a03ca61

File tree

2 files changed

+1159
-749
lines changed

2 files changed

+1159
-749
lines changed

examples/case_studies/reliability_and_calibrated_prediction.ipynb

+1,118-690
Large diffs are not rendered by default.

examples/case_studies/reliability_and_calibrated_prediction.myst.md

+41-59
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ jupytext:
55
format_name: myst
66
format_version: 0.13
77
kernelspec:
8-
display_name: Python [conda env:pymc_ar_ex] *
8+
display_name: Python 3 (ipykernel)
99
language: python
10-
name: conda-env-pymc_ar_ex-py
10+
name: python3
1111
substitutions:
1212
extra_dependencies: lifelines
1313
---
@@ -21,6 +21,9 @@ substitutions:
2121
:author: Nathaniel Forde
2222
:::
2323

24+
:::{include} ../extra_installs.md
25+
:::
26+
2427
```{code-cell} ipython3
2528
import os
2629
import random
@@ -358,9 +361,9 @@ def plot_cdfs(actuarial_table, dist_fits=True, ax=None, title="", xy=(3000, 0.5)
358361
)
359362
ax.plot(actuarial_table["t"], actuarial_table["CI_95_ub"], color="darkorchid", linestyle="--")
360363
ax.fill_between(
361-
actuarial_table["t"],
362-
actuarial_table["CI_95_lb"],
363-
actuarial_table["CI_95_ub"],
364+
actuarial_table["t"].values,
365+
actuarial_table["CI_95_lb"].values,
366+
actuarial_table["CI_95_ub"].values,
364367
color="darkorchid",
365368
alpha=0.2,
366369
)
@@ -375,9 +378,9 @@ def plot_cdfs(actuarial_table, dist_fits=True, ax=None, title="", xy=(3000, 0.5)
375378
actuarial_table["t"], actuarial_table["logit_CI_95_ub"], color="royalblue", linestyle="--"
376379
)
377380
ax.fill_between(
378-
actuarial_table["t"],
379-
actuarial_table["logit_CI_95_lb"],
380-
actuarial_table["logit_CI_95_ub"],
381+
actuarial_table["t"].values,
382+
actuarial_table["logit_CI_95_lb"].values,
383+
actuarial_table["logit_CI_95_ub"].values,
381384
color="royalblue",
382385
alpha=0.2,
383386
)
@@ -401,7 +404,7 @@ def plot_cdfs(actuarial_table, dist_fits=True, ax=None, title="", xy=(3000, 0.5)
401404
```
402405

403406
```{code-cell} ipython3
404-
plot_cdfs(actuarial_table_shock, title="Shock Absorber")
407+
plot_cdfs(actuarial_table_shock, title="Shock Absorber");
405408
```
406409

407410
This shows a good fit to the data and implies, as you might expect, that the failing fraction of shock absorbers increases with age as they wear out. But how do we quantify the prediction given an estimated model?
@@ -444,23 +447,6 @@ def bayes_boot(df, lb, ub, seed=100):
444447
```
445448

446449
```{code-cell} ipython3
447-
def bootstrap(df, lb, ub, seed=100):
448-
draws = df[["t", "failed"]].sample(replace=True, frac=1, random_state=seed)
449-
draws.sort_values("t", inplace=True)
450-
## Fit Lognormal Dist to
451-
lnf = LogNormalFitter().fit(draws["t"] + 1e-25, draws["failed"])
452-
rv = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_))
453-
## Sample random choice from implied distribution
454-
choices = rv.rvs(1000)
455-
future = random.choice(choices)
456-
## Check if choice is contained within the MLE 95% PI
457-
contained = (future >= lb) & (future <= ub)
458-
## Record 95% interval of bootstrapped dist
459-
lb = rv.ppf(0.025)
460-
ub = rv.ppf(0.975)
461-
return lb, ub, contained, future, lnf.sigma_, lnf.mu_
462-
463-
464450
CIs = [bayes_boot(actuarial_table_shock, 8928, 70188, seed=i) for i in range(1000)]
465451
draws = pd.DataFrame(
466452
CIs, columns=["Lower Bound PI", "Upper Bound PI", "Contained", "future", "Sigma", "Mu"]
@@ -953,15 +939,11 @@ az.summary(idata_informative)
953939
```
954940

955941
```{code-cell} ipython3
956-
joint_draws = az.extract_dataset(idata, group="posterior", num_samples=1000)[
957-
["alpha", "beta"]
958-
].to_dataframe()
942+
joint_draws = az.extract(idata, num_samples=1000)[["alpha", "beta"]]
959943
alphas = joint_draws["alpha"].values
960944
betas = joint_draws["beta"].values
961945
962-
joint_draws_informative = az.extract_dataset(
963-
idata_informative, group="posterior", num_samples=1000
964-
)[["alpha", "beta"]].to_dataframe()
946+
joint_draws_informative = az.extract(idata_informative, num_samples=1000)[["alpha", "beta"]]
965947
alphas_informative = joint_draws_informative["alpha"].values
966948
betas_informative = joint_draws_informative["beta"].values
967949
@@ -1085,24 +1067,24 @@ print("Upper Bound 95% PI:", binom(1700, rho).ppf(0.95))
10851067
We'll use the posterior predictive distribution of the uninformative model. We show here how to derive the uncertainty in the estimates of the 95% prediction interval for the number of failures in a time interval. As we saw above the MLE alternative to this procedure is to generate a predictive distribution from bootstrap sampling. The bootstrap procedure tends to agree with the plug-in procedure using the MLE estimates and lacks the flexibility of specifying prior information.
10861068

10871069
```{code-cell} ipython3
1070+
import xarray as xr
1071+
1072+
from xarray_einstats.stats import XrContinuousRV, XrDiscreteRV
1073+
1074+
10881075
def PI_failures(joint_draws, lp, up, n_at_risk):
1089-
records = []
1090-
alphas = joint_draws["alpha"].values
1091-
betas = joint_draws["beta"].values
1092-
for i in range(len(joint_draws)):
1093-
fit = weibull_min(c=alphas[i], scale=betas[i])
1094-
rho = fit.cdf(up) - fit.cdf(lp) / (1 - fit.cdf(lp))
1095-
lb = binom(n_at_risk, rho).ppf(0.05)
1096-
ub = binom(n_at_risk, rho).ppf(0.95)
1097-
point_prediction = n_at_risk * rho
1098-
records.append([alphas[i], betas[i], rho, n_at_risk, lb, ub, point_prediction])
1099-
return pd.DataFrame(
1100-
records, columns=["alpha", "beta", "rho", "n_at_risk", "lb", "ub", "expected"]
1076+
fit = XrContinuousRV(weibull_min, joint_draws["alpha"], scale=joint_draws["beta"])
1077+
    rho = (fit.cdf(up) - fit.cdf(lp)) / (1 - fit.cdf(lp))
1078+
lub = XrDiscreteRV(binom, n_at_risk, rho).ppf([0.05, 0.95])
1079+
lb, ub = lub.sel(quantile=0.05, drop=True), lub.sel(quantile=0.95, drop=True)
1080+
point_prediction = n_at_risk * rho
1081+
return xr.Dataset(
1082+
{"rho": rho, "n_at_risk": n_at_risk, "lb": lb, "ub": ub, "expected": point_prediction}
11011083
)
11021084
11031085
1104-
output_df = PI_failures(joint_draws, 150, 600, 1700)
1105-
output_df
1086+
output_ds = PI_failures(joint_draws, 150, 600, 1700)
1087+
output_ds
11061088
```
11071089

11081090
```{code-cell} ipython3
@@ -1120,9 +1102,9 @@ ax1 = axs[1]
11201102
ax2 = axs[2]
11211103
11221104
ax.scatter(
1123-
output_df["alpha"],
1124-
output_df["expected"],
1125-
c=output_df["beta"],
1105+
joint_draws["alpha"],
1106+
output_ds["expected"],
1107+
c=joint_draws["beta"],
11261108
cmap=cm.cool,
11271109
alpha=0.3,
11281110
label="Coloured by function of Beta values",
@@ -1136,39 +1118,39 @@ ax.set_title(
11361118
)
11371119
11381120
ax1.hist(
1139-
output_df["lb"],
1121+
output_ds["lb"],
11401122
ec="black",
11411123
color="slateblue",
11421124
label="95% PI Lower Bound on Failure Count",
11431125
alpha=0.3,
11441126
)
1145-
ax1.axvline(output_df["lb"].mean(), label="Expected 95% PI Lower Bound on Failure Count")
1146-
ax1.axvline(output_df["ub"].mean(), label="Expected 95% PI Upper Bound on Failure Count")
1127+
ax1.axvline(output_ds["lb"].mean(), label="Expected 95% PI Lower Bound on Failure Count")
1128+
ax1.axvline(output_ds["ub"].mean(), label="Expected 95% PI Upper Bound on Failure Count")
11471129
ax1.hist(
1148-
output_df["ub"],
1130+
output_ds["ub"],
11491131
ec="black",
11501132
color="cyan",
11511133
label="95% PI Upper Bound on Failure Count",
11521134
bins=20,
11531135
alpha=0.3,
11541136
)
11551137
ax1.hist(
1156-
output_df["expected"], ec="black", color="pink", label="Expected Count of Failures", bins=20
1138+
output_ds["expected"], ec="black", color="pink", label="Expected Count of Failures", bins=20
11571139
)
11581140
ax1.set_title("Uncertainty in the Posterior Prediction Interval of Failure Counts", fontsize=20)
11591141
ax1.legend()
11601142
11611143
ax2.set_title("Expected Costs Distribution(s) \nbased on implied Failure counts", fontsize=20)
11621144
ax2.hist(
1163-
cost_func(output_df["expected"], 2.3),
1145+
cost_func(output_ds["expected"], 2.3),
11641146
label="Cost(failures,2.3)",
11651147
color="royalblue",
11661148
alpha=0.3,
11671149
ec="black",
11681150
bins=20,
11691151
)
11701152
ax2.hist(
1171-
cost_func(output_df["expected"], 2),
1153+
cost_func(output_ds["expected"], 2),
11721154
label="Cost(failures,2)",
11731155
color="red",
11741156
alpha=0.5,
@@ -1177,7 +1159,7 @@ ax2.hist(
11771159
)
11781160
ax2.set_xlabel("$ cost")
11791161
# ax2.set_xlim(-60, 0)
1180-
ax2.legend()
1162+
ax2.legend();
11811163
```
11821164

11831165
The choice of model in such cases is crucial. The decision about which failure profile is apt has to be informed by a subject matter expert because extrapolation from such sparse data is always risky. An understanding of the uncertainty is crucial if real costs attach to the failures, and the subject matter expert is usually better placed to tell you if 2 or 7 failures can be plausibly expected within 600 hours of service.
@@ -1193,7 +1175,7 @@ In particular we've seen how the MLE fits to our bearings data provide a decent
11931175

11941176
## Authors
11951177

1196-
* Authored by Nathaniel Forde on 9th of January 2022 ([pymc-examples491](https://github.com/pymc-devs/pymc-examples/pull/491))
1178+
* Authored by Nathaniel Forde on 9th of January 2022 ([pymc-examples#491](https://github.com/pymc-devs/pymc-examples/pull/491))
11971179

11981180
+++
11991181

@@ -1209,7 +1191,7 @@ In particular we've seen how the MLE fits to our bearings data provide a decent
12091191

12101192
```{code-cell} ipython3
12111193
%load_ext watermark
1212-
%watermark -n -u -v -iv -w -p pytensor
1194+
%watermark -n -u -v -iv -w -p pytensor,xarray_einstats
12131195
```
12141196

12151197
:::{include} ../page_footer.md

0 commit comments

Comments
 (0)