|
3 | 3 | Functions to help generate sensor for different geographical levels
|
4 | 4 | """
|
5 | 5 | import pandas as pd
|
6 |
| -from .data_tools import fill_dates, raw_positive_prop, smoothed_positive_prop |
| 6 | +from .data_tools import (fill_dates, raw_positive_prop, |
| 7 | + smoothed_positive_prop, |
| 8 | + smoothed_tests_per_device, |
| 9 | + raw_tests_per_device) |
7 | 10 |
|
# Minimum number of observations required in order to compute a proportion;
# passed as `min_obs` to the raw and smoothed estimators in this module.
MIN_OBS = 50
# Passed as `pool_days` to the smoothed estimators in this module.
POOL_DAYS = 7
|
10 | 13 |
|
def generate_sensor_for_states(state_groups, smooth, device, first_date, last_date):
    """
    Fit the sensor over states.

    For every state in `state_groups`, the state's rows are re-indexed by
    "timestamp", passed through `fill_dates(..., first_date, last_date)`, and
    then fed to one of four estimators selected by `device` and `smooth`.

    Args:
        state_groups: pd.core.groupby.generic.DataFrameGroupBy
            Data grouped by "state_id". Each group must carry the columns
            "state_id", "timestamp" and "totalTest", plus "numUniqueDevices"
            when `device` is true or "positiveTest" when it is false.
        smooth: bool
            Use the smoothed estimator if true, the raw estimator otherwise.
        device: bool
            Compute tests per device if true, percent positive otherwise.
        first_date: forwarded unchanged to `fill_dates`.
        last_date: forwarded unchanged to `fill_dates`.
    Returns:
        df: pd.DataFrame
            Columns "geo_id", "val", "se", "sample_size", "timestamp"; one row
            per state and per entry of the filled index. Row labels restart
            at 0 for every state (they are not renumbered).
    """
    columns = ["geo_id", "val", "se", "sample_size", "timestamp"]
    frames = []
    for state in state_groups.groups:
        state_group = state_groups.get_group(state).drop(columns=["state_id"])
        state_group = state_group.set_index("timestamp")
        state_group = fill_dates(state_group, first_date, last_date)

        tests = state_group["totalTest"].values
        # Plain truthiness tests instead of bitwise `&`: the bitwise form picks
        # the wrong branch for truthy non-bool flags (e.g. 2 & 1 == 0).
        if device:
            devices = state_group["numUniqueDevices"].values
            if smooth:
                # smoothed tests per device
                stat, se, sample_size = smoothed_tests_per_device(
                    devices=devices, tests=tests,
                    min_obs=MIN_OBS, pool_days=POOL_DAYS)
            else:
                # raw tests per device
                stat, se, sample_size = raw_tests_per_device(
                    devices=devices, tests=tests, min_obs=MIN_OBS)
        else:
            positives = state_group["positiveTest"].values
            if smooth:
                # smoothed pct positive
                stat, se, sample_size = smoothed_positive_prop(
                    tests=tests, positives=positives,
                    min_obs=MIN_OBS, pool_days=POOL_DAYS)
            else:
                # raw pct positive
                stat, se, sample_size = raw_positive_prop(
                    tests=tests, positives=positives, min_obs=MIN_OBS)
            # Only the positive-proportion estimate is rescaled by 100.
            stat = stat * 100

        # The standard error is rescaled by 100 for every metric.
        # NOTE(review): this also applies to tests-per-device, whose `stat` is
        # not rescaled above -- confirm that this asymmetry is intended.
        se = se * 100
        frames.append(pd.DataFrame({"geo_id": state,
                                    "timestamp": state_group.index,
                                    "val": stat,
                                    "se": se,
                                    "sample_size": sample_size}))

    if not frames:
        # No states at all: return the empty, correctly-labelled frame.
        return pd.DataFrame(columns=columns)
    # A single concat replaces the per-iteration DataFrame.append, which was
    # removed in pandas 2.0 and re-copied the accumulated frame on every pass.
    return pd.concat(frames)[columns]
46 | 69 |
|
47 |
| -def generate_sensor_for_other_geores(state_groups, data, res_key, smooth, first_date, last_date): |
| 70 | +def generate_sensor_for_other_geores(state_groups, data, res_key, smooth, |
| 71 | + device, first_date, last_date): |
48 | 72 | """
|
49 | 73 | fit over counties/HRRs/MSAs
|
50 | 74 | Args:
|
51 | 75 | data: pd.DataFrame
|
52 | 76 | res_key: "fips", "cbsa_id" or "hrrnum"
|
53 | 77 | smooth: bool
|
| 78 | + Consider raw or smooth |
| 79 | + device: bool |
| 80 | + Consider test_per_device or pct_positive |
54 | 81 | Returns:
|
55 | 82 | df: pd.DataFrame
|
56 | 83 | """
|
| 84 | + has_parent = True |
57 | 85 | res_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size"])
|
58 | 86 | res_groups = data.groupby(res_key)
|
59 | 87 | loc_list = list(res_groups.groups.keys())
|
60 | 88 | for loc in loc_list:
|
61 | 89 | res_group = res_groups.get_group(loc)
|
62 | 90 | parent_state = res_group['state_id'].values[0]
|
63 |
| - parent_group = state_groups.get_group(parent_state) |
64 |
| - res_group = res_group.merge(parent_group, how="left", |
65 |
| - on="timestamp", suffixes=('', '_parent')) |
66 |
| - res_group = res_group.drop(columns=[res_key, "state_id", "state_id" + '_parent']) |
| 91 | + try: |
| 92 | + parent_group = state_groups.get_group(parent_state) |
| 93 | + res_group = res_group.merge(parent_group, how="left", |
| 94 | + on="timestamp", suffixes=('', '_parent')) |
| 95 | + res_group = res_group.drop(columns=[res_key, "state_id", "state_id" + '_parent']) |
| 96 | + except: |
| 97 | + has_parent = False |
| 98 | + res_group = res_group.drop(columns=[res_key, "state_id"]) |
67 | 99 | res_group.set_index("timestamp", inplace=True)
|
68 | 100 | res_group = fill_dates(res_group, first_date, last_date)
|
69 | 101 |
|
70 | 102 | if smooth:
|
71 |
| - stat, se, sample_size = smoothed_positive_prop( |
72 |
| - tests=res_group['totalTest'].values, |
73 |
| - positives=res_group['positiveTest'].values, |
74 |
| - min_obs=MIN_OBS, pool_days=POOL_DAYS, |
75 |
| - parent_tests=res_group["totalTest_parent"].values, |
76 |
| - parent_positives=res_group['positiveTest_parent'].values) |
| 103 | + if has_parent: |
| 104 | + if device: |
| 105 | + stat, se, sample_size = smoothed_tests_per_device( |
| 106 | + devices=res_group["numUniqueDevices"].values, |
| 107 | + tests=res_group['totalTest'].values, |
| 108 | + min_obs=MIN_OBS, pool_days=POOL_DAYS, |
| 109 | + parent_devices=res_group["numUniqueDevices_parent"].values, |
| 110 | + parent_tests=res_group["totalTest_parent"].values) |
| 111 | + else: |
| 112 | + stat, se, sample_size = smoothed_positive_prop( |
| 113 | + tests=res_group['totalTest'].values, |
| 114 | + positives=res_group['positiveTest'].values, |
| 115 | + min_obs=MIN_OBS, pool_days=POOL_DAYS, |
| 116 | + parent_tests=res_group["totalTest_parent"].values, |
| 117 | + parent_positives=res_group['positiveTest_parent'].values) |
| 118 | + stat = stat * 100 |
| 119 | + else: |
| 120 | + if device: |
| 121 | + stat, se, sample_size = smoothed_tests_per_device( |
| 122 | + devices=res_group["numUniqueDevices"].values, |
| 123 | + tests=res_group['totalTest'].values, |
| 124 | + min_obs=MIN_OBS, pool_days=POOL_DAYS) |
| 125 | + else: |
| 126 | + stat, se, sample_size = smoothed_positive_prop( |
| 127 | + tests=res_group['totalTest'].values, |
| 128 | + positives=res_group['positiveTest'].values, |
| 129 | + min_obs=MIN_OBS, pool_days=POOL_DAYS) |
| 130 | + stat = stat * 100 |
77 | 131 | else:
|
78 |
| - stat, se, sample_size = raw_positive_prop( |
79 |
| - tests=res_group['totalTest'].values, |
80 |
| - positives=res_group['positiveTest'].values, |
81 |
| - min_obs=MIN_OBS) |
82 |
| - stat = stat * 100 |
83 |
| - se = se * 100 |
| 132 | + if device: |
| 133 | + stat, se, sample_size = raw_tests_per_device( |
| 134 | + devices=res_group["numUniqueDevices"].values, |
| 135 | + tests=res_group['totalTest'].values, |
| 136 | + min_obs=MIN_OBS) |
| 137 | + else: |
| 138 | + stat, se, sample_size = raw_positive_prop( |
| 139 | + tests=res_group['totalTest'].values, |
| 140 | + positives=res_group['positiveTest'].values, |
| 141 | + min_obs=MIN_OBS) |
| 142 | + stat = stat * 100 |
84 | 143 |
|
| 144 | + se = se * 100 |
85 | 145 | res_df = res_df.append(pd.DataFrame({"geo_id": loc,
|
86 | 146 | "timestamp": res_group.index,
|
87 | 147 | "val": stat,
|
|
0 commit comments