|
33 | 33 | "import pandas as pd\n",
|
34 | 34 | "import pymc as pm\n",
|
35 | 35 | "\n",
|
36 |
| - "from joblib import Parallel, delayed\n", |
37 | 36 | "from lifelines import KaplanMeierFitter, LogNormalFitter, WeibullFitter\n",
|
38 | 37 | "from lifelines.utils import survival_table_from_events\n",
|
39 | 38 | "from scipy.stats import binom, lognorm, norm, weibull_min"
|
|
114 | 113 | }
|
115 | 114 | ],
|
116 | 115 | "source": [
|
117 |
| - "from scipy.stats import lognorm\n", |
118 |
| - "\n", |
119 | 116 | "mu, sigma = 6, 0.3\n",
|
120 | 117 | "\n",
|
121 | 118 | "\n",
|
122 | 119 | "def plot_ln_pi(mu, sigma, xy=(700, 75), title=\"Exact Prediction Interval for Known Lognormal\"):\n",
|
123 | 120 | " failure_dist = lognorm(s=sigma, scale=np.exp(mu))\n",
|
124 | 121 | " samples = failure_dist.rvs(size=1000, random_state=100)\n",
|
125 |
| - " fig, axs = plt.subplots(1, 3, figsize=(20, 10))\n", |
| 122 | + " fig, axs = plt.subplots(1, 3, figsize=(20, 8))\n", |
126 | 123 | " axs = axs.flatten()\n",
|
127 | 124 | " axs[0].hist(samples, ec=\"black\", color=\"slateblue\", bins=30)\n",
|
128 | 125 | " axs[0].set_title(f\"Failure Time Distribution: LN({mu}, {sigma})\")\n",
|
|
1970 | 1967 | "def bayes_boot(df, lb, ub, seed=100):\n",
|
1971 | 1968 | " w = np.random.dirichlet(np.ones(len(df)), 1)[0]\n",
|
1972 | 1969 | " lnf = LogNormalFitter().fit(df[\"t\"] + 1e-25, df[\"failed\"], weights=w)\n",
|
1973 |
| - " ## Sample random choice from 95% percentile interval of bootstrapped dist\n", |
1974 |
| - " # choices = draws['t'].values\n", |
1975 |
| - " choices = np.linspace(df[\"t\"].min(), df[\"t\"].max(), 1000)\n", |
| 1970 | + " rv = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_))\n", |
| 1971 | + " ## Sample random choice from implied bootstrapped distribution\n", |
| 1972 | + " choices = rv.rvs(1000)\n", |
1976 | 1973 | " future = random.choice(choices)\n",
|
1977 | 1974 | " ## Check if choice is contained within the MLE 95% PI\n",
|
1978 | 1975 | " contained = (future >= lb) & (future <= ub)\n",
|
1979 | 1976 | " ## Record 95% interval of bootstrapped dist\n",
|
1980 |
| - " lb = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_)).ppf(0.025)\n", |
1981 |
| - " ub = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_)).ppf(0.975)\n", |
| 1977 | + " lb = rv.ppf(0.025)\n", |
| 1978 | + " ub = rv.ppf(0.975)\n", |
1982 | 1979 | " return lb, ub, contained, future, lnf.sigma_, lnf.mu_"
|
1983 | 1980 | ]
|
1984 | 1981 | },
|
|
2162 | 2159 | " draws.sort_values(\"t\", inplace=True)\n",
|
2163 | 2160 | " ## Fit Lognormal Dist to\n",
|
2164 | 2161 | " lnf = LogNormalFitter().fit(draws[\"t\"] + 1e-25, draws[\"failed\"])\n",
|
2165 |
| - " ## Sample random choice from 95% percentile interval of bootstrapped dist\n", |
2166 |
| - " # choices = draws['t'].values\n", |
2167 |
| - " ## Essentially sampling from a uniform interval\n", |
2168 |
| - " choices = np.linspace(draws[\"t\"].min(), draws[\"t\"].max(), 1000)\n", |
| 2162 | + " rv = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_))\n", |
| 2163 | + " ## Sample random choice from implied distribution\n", |
| 2164 | + " choices = rv.rvs(1000)\n", |
2169 | 2165 | " future = random.choice(choices)\n",
|
2170 | 2166 | " ## Check if choice is contained within the MLE 95% PI\n",
|
2171 | 2167 | " contained = (future >= lb) & (future <= ub)\n",
|
2172 | 2168 | " ## Record 95% interval of bootstrapped dist\n",
|
2173 |
| - " lb = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_)).ppf(0.025)\n", |
2174 |
| - " ub = lognorm(s=lnf.sigma_, scale=np.exp(lnf.mu_)).ppf(0.975)\n", |
| 2169 | + " lb = rv.ppf(0.025)\n", |
| 2170 | + " ub = rv.ppf(0.975)\n", |
2175 | 2171 | " return lb, ub, contained, future, lnf.sigma_, lnf.mu_\n",
|
2176 | 2172 | "\n",
|
2177 | 2173 | "\n",
|
|
2186 | 2182 | "cell_type": "markdown",
|
2187 | 2183 | "metadata": {},
|
2188 | 2184 | "source": [
|
2189 |
| - "We can use these bootstrapped statistics to further calculate quantities of the predictive distribution." |
| 2185 | + "We can use these bootstrapped statistics to further calculate quantities of the predictive distribution. In our case, we could use the parametric CDF for our simple parametric model, but we'll adopt the empirical CDF here to show how this technique can be used when we have more complicated models too." |
2190 | 2186 | ]
|
2191 | 2187 | },
|
2192 | 2188 | {
|
|
2234 | 2230 | "for i in range(1000):\n",
|
2235 | 2231 | " samples = lognorm(s=draws.iloc[i][\"Sigma\"], scale=np.exp(draws.iloc[i][\"Mu\"])).rvs(1000)\n",
|
2236 | 2232 | " qe, pe = ecdf(samples)\n",
|
2237 |
| - " ax.plot(qe, pe, color=\"grey\", alpha=0.2)\n", |
| 2233 | + " ax.plot(qe, pe, color=\"skyblue\", alpha=0.2)\n", |
2238 | 2234 | " lkup = dict(zip(pe, qe))\n",
|
2239 | 2235 | " hist_data.append([lkup[0.05]])\n",
|
2240 | 2236 | "hist_data = pd.DataFrame(hist_data, columns=[\"p05\"])\n",
|
|
2246 | 2242 | "ax1.hist(hist_data[\"p05\"], color=\"slateblue\", ec=\"black\", alpha=0.4, bins=30)\n",
|
2247 | 2243 | "ax1.set_title(\"Estimate of Uncertainty in the 5% Failure Time\", fontsize=20)\n",
|
2248 | 2244 | "ax1.axvline(\n",
|
2249 |
| - " hist_data[\"p05\"].quantile(0.025), color=\"cyan\", label=\"Lower Bound PI for 5% failure time\"\n", |
| 2245 | + " hist_data[\"p05\"].quantile(0.025), color=\"cyan\", label=\"Lower Bound CI for 5% failure time\"\n", |
2250 | 2246 | ")\n",
|
2251 | 2247 | "ax1.axvline(\n",
|
2252 |
| - " hist_data[\"p05\"].quantile(0.975), color=\"cyan\", label=\"Upper Bound PI for 5% failure time\"\n", |
| 2248 | + " hist_data[\"p05\"].quantile(0.975), color=\"cyan\", label=\"Upper Bound CI for 5% failure time\"\n", |
2253 | 2249 | ")\n",
|
2254 | 2250 | "ax1.legend()\n",
|
2255 | 2251 | "ax.legend();"
|
|
2365 | 2361 | "cell_type": "markdown",
|
2366 | 2362 | "metadata": {},
|
2367 | 2363 | "source": [
|
2368 |
| - "These simulations should be repeated a far larger number of times than we do here. We can also vary the interval size to achieve the desired coverage level." |
| 2364 | + "These simulations should be repeated a far larger number of times than we do here. It should be clear how we can also vary the MLE interval size to achieve the desired coverage level." |
2369 | 2365 | ]
|
2370 | 2366 | },
|
2371 | 2367 | {
|
|
7333 | 7329 | "hist_data_info = pd.DataFrame(hist_data_info, columns=[\"p10\", \"p05\"])\n",
|
7334 | 7330 | "draws = pm.draw(pm.Weibull.dist(alpha=np.mean(alphas), beta=np.mean(betas)), 1000)\n",
|
7335 | 7331 | "qe, pe = ecdf(draws)\n",
|
7336 |
| - "ax.plot(qe, pe, color=\"purple\", label=\"Expected CDF Uninformative\")\n", |
| 7332 | + "ax.plot(qe, pe, color=\"purple\", label=\"Expected CDF Uninformative Prior\")\n", |
7337 | 7333 | "draws = pm.draw(\n",
|
7338 | 7334 | " pm.Weibull.dist(alpha=np.mean(alphas_informative), beta=np.mean(betas_informative)), 1000\n",
|
7339 | 7335 | ")\n",
|
7340 | 7336 | "qe, pe = ecdf(draws)\n",
|
7341 |
| - "ax.plot(qe, pe, color=\"magenta\", label=\"Expected CDF Informative\")\n", |
| 7337 | + "ax.plot(qe, pe, color=\"magenta\", label=\"Expected CDF Informative Prior\")\n", |
7342 | 7338 | "ax.plot(\n",
|
7343 | 7339 | " actuarial_table_bearings[\"t\"],\n",
|
7344 | 7340 | " actuarial_table_bearings[\"logit_CI_95_ub\"],\n",
|
|
0 commit comments