
Commit 40dfe17

merge changes from covid-19 repo updates

1 parent 448bc77 commit 40dfe17
5 files changed: +163 −114 lines changed

doctor_visits/delphi_doctor_visits/config.py

Lines changed: 8 additions & 22 deletions
```diff
@@ -3,7 +3,7 @@
 
 Author: Maria
 Created: 2020-04-16
-Last modified: 2020-05-12
+Last modified: 2020-06-17
 """
 
 from datetime import datetime, timedelta
@@ -14,11 +14,8 @@ class Config:
     """
 
     # dates
+    FIRST_DATA_DATE = datetime(2020, 1, 1)
     DAY_SHIFT = timedelta(days=1)  # shift dates forward for labeling purposes
-    # Feb 1 is when we start producing sensors
-    FIRST_SENSOR_DATE = datetime(2020, 2, 1) - DAY_SHIFT
-    # add burn-in sensor dates to calculate direction
-    BURN_IN_DATE = datetime(2020, 1, 29) - DAY_SHIFT
 
     # data columns
     CLI_COLS = ["Covid_like", "Flu_like", "Mixed"]
@@ -30,26 +27,15 @@ class Config:
     HRR_COLS = ["Pat HRR Name", "Pat HRR ID"]
     ID_COLS = [DATE_COL] + [GEO_COL] + [AGE_COL] + HRR_COLS
     FILT_COLS = ID_COLS + COUNT_COLS
-    DTYPES = {
-        "ServiceDate": str,
-        "PatCountyFIPS": str,
-        "Denominator": int,
-        "Flu1": int,
-        "Covid_like": int,
-        "Flu_like": int,
-        "Mixed": int,
-        "PatAgeGroup": str,
-        "Pat HRR Name": str,
-        "Pat HRR ID": float,
-    }
+    DTYPES = {"ServiceDate": str, "PatCountyFIPS": str,
+              "Denominator": int, "Flu1": int,
+              "Covid_like": int, "Flu_like": int,
+              "Mixed": int, "PatAgeGroup": str,
+              "Pat HRR Name": str, "Pat HRR ID": float}
 
     SMOOTHER_BANDWIDTH = 100  # bandwidth for the linear left Gaussian filter
-    MIN_OBS = 2500  # number of total visits needed to produce a sensor
-    MAX_BACKFILL_WINDOW = (
-        7  # maximum number of days used to average a backfill correction
-    )
+    MAX_BACKFILL_WINDOW = 7  # maximum number of days used to average a backfill correction
    MIN_CUM_VISITS = 500  # need to observe at least 500 counts before averaging
     RECENT_LENGTH = 7  # number of days to sum over for sparsity threshold
     MIN_RECENT_VISITS = 100  # min numbers of visits needed to include estimate
     MIN_RECENT_OBS = 3  # minimum days needed to produce an estimate for latest time
-    assert MIN_OBS >= MIN_CUM_VISITS, "Backfill adjustment not guaranteed to work"
```
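The DTYPES table above was compacted but is otherwise unchanged; it pins the schema applied when the drop file is read, so county FIPS codes keep their leading zeros and count columns stay integral. A minimal sketch of how such a mapping is typically consumed, assuming a pandas-based loader and an illustrative filename (the module's actual load step may differ):

```python
import pandas as pd

from delphi_doctor_visits.config import Config

# illustrative drop file; real names follow
# EDI_AGG_OUTPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz
df = pd.read_csv("EDI_AGG_OUTPATIENT_17062020_1455CDT.csv.gz",
                 dtype=Config.DTYPES)

# dates arrive as strings; convert, then keep rows from FIRST_DATA_DATE on
df[Config.DATE_COL] = pd.to_datetime(df[Config.DATE_COL])
df = df[df[Config.DATE_COL] >= Config.FIRST_DATA_DATE]
```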

doctor_visits/delphi_doctor_visits/run.py

Lines changed: 35 additions & 22 deletions
```diff
@@ -7,7 +7,7 @@
 
 # standard packages
 import logging
-from datetime import datetime
+from datetime import datetime, timedelta
 from pathlib import Path
 
 # third party
@@ -23,46 +23,59 @@ def run_module():
 
     logging.basicConfig(level=logging.DEBUG)
 
-    ## start date will be Jan 1
-    logging.info("start date:\t%s", params["start_date"])
-
     ## get end date from input file
     # the filename is expected to be in the format:
     # "EDI_AGG_OUTPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz"
-    if params["end_date"] == "":
-        dropdate = str(
-            datetime.strptime(
-                Path(params["input_file"]).name.split("_")[3], "%d%m%Y"
-            ).date()
+    if params["drop_date"] == "":
+        dropdate_dt = datetime.strptime(
+            Path(params["input_file"]).name.split("_")[3], "%d%m%Y"
         )
     else:
-        dropdate = params["end_date"]
+        dropdate_dt = datetime.strptime(params["end_date"], "%Y-%m-%d")
+    dropdate = str(dropdate_dt.date())
 
-    logging.info("drop date:\t%s", dropdate)
+    # range of estimates to produce
+    n_backfill_days = params["n_backfill_days"]  # produce estimates for n_backfill_days
+    n_waiting_days = params["n_waiting_days"]  # most recent n_waiting_days won't be est
+    enddate_dt = dropdate_dt - timedelta(days=n_waiting_days)
+    startdate_dt = enddate_dt - timedelta(days=n_backfill_days)
+    enddate = str(enddate_dt.date())
+    startdate = str(startdate_dt.date())
+    logging.info(f"drop date:\t\t{dropdate}")
+    logging.info(f"first sensor date:\t{startdate}")
+    logging.info(f"last sensor date:\t{enddate}")
+    logging.info(f"n_backfill_days:\t{n_backfill_days}")
+    logging.info(f"n_waiting_days:\t{n_waiting_days}")
 
     ## geographies
     geos = ["state", "msa", "hrr", "county"]
 
     ## print out other vars
-    logging.info("outpath:\t%s", params["export_dir"])
-    logging.info("parallel:\t%s", params["parallel"])
+    logging.info("outpath:\t\t%s", params["export_dir"])
+    logging.info("parallel:\t\t%s", params["parallel"])
+    logging.info(f"weekday:\t\t%s", params["weekday"])
+    logging.info(f"write se:\t\t%s", params["se"])
+    logging.info(f"obfuscated prefix:\t%s", params["obfuscated_prefix"])
 
     ## start generating
     for geo in geos:
-        for weekday in [True, False]:
+        for weekday in params["weekday"]:
             if weekday:
                 logging.info("starting %s, weekday adj", geo)
             else:
                 logging.info("starting %s, no adj", geo)
             update_sensor(
-                params["input_file"],
-                params["export_dir"],
-                params["static_file_dir"],
-                params["start_date"],
-                dropdate,
-                geo,
-                params["parallel"],
-                weekday,
+                filepath=params["input_file"],
+                outpath=params["export_dir"],
+                staticpath=params["static_file_dir"],
+                startdate=startdate,
+                enddate=enddate,
+                dropdate=dropdate,
+                geo=geo,
+                parallel=params["parallel"],
+                weekday=weekday,
+                se=params["se"],
+                prefix=params["obfuscated_prefix"]
             )
             logging.info("finished %s", geo)
 
```
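With FIRST_SENSOR_DATE and BURN_IN_DATE gone from Config, the sensor date range is now derived at runtime: the drop date comes from the input filename (or from params), the most recent n_waiting_days are withheld as too incomplete to estimate, and estimates reach back n_backfill_days before that. A self-contained sketch of the same arithmetic, with assumed parameter values standing in for params.json:

```python
from datetime import datetime, timedelta
from pathlib import Path

# assumed values for illustration; the real ones come from params
input_file = "receiving/EDI_AGG_OUTPATIENT_17062020_1455CDT.csv.gz"
n_backfill_days = 60  # produce estimates this far back
n_waiting_days = 3    # most recent days are too incomplete to estimate

# filename format: EDI_AGG_OUTPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz
dropdate_dt = datetime.strptime(Path(input_file).name.split("_")[3], "%d%m%Y")
enddate_dt = dropdate_dt - timedelta(days=n_waiting_days)
startdate_dt = enddate_dt - timedelta(days=n_backfill_days)

print(dropdate_dt.date())   # 2020-06-17
print(enddate_dt.date())    # 2020-06-14
print(startdate_dt.date())  # 2020-04-15
```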

doctor_visits/delphi_doctor_visits/sensor.py

Lines changed: 43 additions & 19 deletions
```diff
@@ -25,7 +25,7 @@ class DoctorVisitsSensor:
 
     @staticmethod
     def transform(
-        sig, h=Config.SMOOTHER_BANDWIDTH, smoother=left_gauss_linear, base=None
+            sig, h=Config.SMOOTHER_BANDWIDTH, smoother=left_gauss_linear, base=None
     ):
         """Transform signal by applying a smoother, and/or adjusting by a base.
 
@@ -80,12 +80,12 @@ def fill_dates(y_data, dates):
 
     @staticmethod
     def backfill(
-        num,
-        den,
-        k=Config.MAX_BACKFILL_WINDOW,
-        min_visits_to_fill=Config.MIN_CUM_VISITS,
-        min_visits_to_include=Config.MIN_RECENT_VISITS,
-        min_recent_obs_to_include=Config.MIN_RECENT_OBS,
+            num,
+            den,
+            k=Config.MAX_BACKFILL_WINDOW,
+            min_visits_to_fill=Config.MIN_CUM_VISITS,
+            min_visits_to_include=Config.MIN_RECENT_VISITS,
+            min_recent_obs_to_include=Config.MIN_RECENT_OBS,
     ):
         """
         Adjust for backfill (retroactively added observations) by using a
@@ -129,17 +129,17 @@ def backfill(
                 for j in range(p):
                     new_num[i, j] = revnum[i, j]
             else:
-                den_bin = revden[i : (i + closest_fill_day + 1)]
+                den_bin = revden[i: (i + closest_fill_day + 1)]
                 new_den[i] = den_bin.sum()
 
                 for j in range(p):
-                    num_bin = revnum[i : (i + closest_fill_day + 1), j]
+                    num_bin = revnum[i: (i + closest_fill_day + 1), j]
                     new_num[i, j] = num_bin.sum()
 
             # if we do not observe at least min_visits_to_include in the denominator or
             # if we observe 0 counts for min_recent_obs window, don't show.
             if (new_den[i] < min_visits_to_include) or (
-                revden[i:][:min_recent_obs_to_include].sum() == 0
+                    revden[i:][:min_recent_obs_to_include].sum() == 0
             ):
                 include[i] = False
 
@@ -156,7 +156,13 @@ def backfill(
         return new_num, new_den, include
 
     @staticmethod
-    def fit(y_data, fit_dates, sensor_dates, geo_id, recent_min_visits, min_recent_obs):
+    def fit(y_data,
+            fit_dates,
+            sensor_dates,
+            geo_id,
+            recent_min_visits,
+            min_recent_obs,
+            jeffreys):
         """Fitting routine.
 
         Args:
@@ -168,6 +174,9 @@ def fit(y_data, fit_dates, sensor_dates, geo_id, recent_min_visits, min_recent_o
                 <RECENT_LENGTH> days
             min_recent_obs: location is sparse also if it has 0 observations in the
                 last min_recent_obs days
+            jeffreys: boolean whether to use Jeffreys estimate for binomial proportion, this
+                is currently only applied if we are writing SEs out. The estimate is
+                p_hat = (x + 0.5)/(n + 1).
 
         Returns: dictionary of results
         """
@@ -176,28 +185,43 @@ def fit(y_data, fit_dates, sensor_dates, geo_id, recent_min_visits, min_recent_o
         sensor_idxs = np.where(y_data.index >= sensor_dates[0])[0]
         n_dates = y_data.shape[0]
 
+        # combine Flu_like and Mixed columns
+        y_data["Flu_like_Mixed"] = y_data["Flu_like"] + y_data["Mixed"]
+        NEW_CLI_COLS = list(set(Config.CLI_COLS) - {"Flu_like", "Mixed"}) + [
+            "Flu_like_Mixed"]
+
+        # small backfill correction
         total_visits = y_data["Denominator"]
-        total_counts = y_data[Config.CLI_COLS + Config.FLU1_COL]
+        total_counts = y_data[NEW_CLI_COLS + Config.FLU1_COL]
         total_counts, total_visits, include = DoctorVisitsSensor.backfill(
             total_counts,
             total_visits,
             min_visits_to_include=recent_min_visits,
-            min_recent_obs_to_include=min_recent_obs,
+            min_recent_obs_to_include=min_recent_obs
         )
-        total_rates = total_counts.div(total_visits, axis=0)
+
+        # jeffreys inflation
+        if jeffreys:
+            total_counts[NEW_CLI_COLS] = total_counts[NEW_CLI_COLS] + 0.5
+            total_rates = total_counts.div(total_visits + 1, axis=0)
+        else:
+            total_rates = total_counts.div(total_visits, axis=0)
+
         total_rates.fillna(0, inplace=True)
         flu1 = total_rates[Config.FLU1_COL]
         new_rates = []
-        for code in Config.CLI_COLS:
+        for code in NEW_CLI_COLS:
             code_vals = total_rates[code]
 
             # if all rates are zero, don't bother
             if code_vals.sum() == 0:
+                if jeffreys:
+                    logging.error("p is 0 even though we used Jefferys estimate")
                 new_rates.append(np.zeros((n_dates,)))
                 continue
 
             # include adjustment for flu like codes
-            base = flu1 if code in ["Flu_like", "Mixed"] else None
+            base = flu1 if code in ["Flu_like_Mixed"] else None
             fitted_codes = DoctorVisitsSensor.transform(
                 code_vals.values.reshape(-1, 1), base=base
             )
@@ -211,9 +235,9 @@ def fit(y_data, fit_dates, sensor_dates, geo_id, recent_min_visits, min_recent_o
         den = total_visits[sensor_idxs].values
 
         # calculate standard error
-        mask = den < 1
-        se = np.sqrt(np.divide((new_rates * (1 - new_rates)), den, where=den != 0))
-        se[mask] = np.nan  # handle case where we observe no visits
+        se = np.full_like(new_rates, np.nan)
+        se[include] = np.sqrt(
+            np.divide((new_rates[include] * (1 - new_rates[include])), den[include]))
 
         logging.debug(f"{geo_id}: {new_rates[-1]:.3f},[{se[-1]:.3f}]")
         return {"geo_id": geo_id, "rate": new_rates, "se": se, "incl": include}
```
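Two numerical changes land in fit(): the optional Jeffreys estimate p_hat = (x + 0.5)/(n + 1), which keeps the proportion (and hence its standard error) strictly positive even when zero CLI visits are observed, and standard errors that now start NaN-filled and are computed only where the backfill sparsity filter set include. A self-contained sketch of both ideas on made-up counts (the committed code applies the SE formula to smoothed rates, so this is illustrative only):

```python
import numpy as np

# made-up counts: the first day observes zero CLI visits
x = np.array([0.0, 12.0, 30.0])          # numerator counts
n = np.array([200.0, 400.0, 1000.0])     # total visits (denominator)
include = np.array([True, True, False])  # sparsity mask from backfill()

# Jeffreys-adjusted binomial proportion: p_hat = (x + 0.5) / (n + 1);
# strictly positive even at x = 0, so the SE below never collapses to 0
p_hat = (x + 0.5) / (n + 1)

# SEs only where the location passed the sparsity filter; excluded
# entries stay NaN, mirroring the np.full_like pattern in the diff
se = np.full_like(p_hat, np.nan)
se[include] = np.sqrt(p_hat[include] * (1 - p_hat[include]) / n[include])

print(np.round(p_hat, 5))  # [0.00249 0.03117 0.03047]
print(np.round(se, 5))     # [0.00352 0.00869     nan]
```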
