Skip to content

Sensorization #568

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 20 commits into
base: dv-package
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions doctor_visits/delphi_doctor_visits/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,24 @@ class Config:
CLI_COLS = ["Covid_like", "Flu_like", "Mixed"]
FLU1_COL = ["Flu1"]
COUNT_COLS = CLI_COLS + FLU1_COL + ["Denominator"]
DATE_COL = "ServiceDate"
GEO_COL = "PatCountyFIPS"
AGE_COL = "PatAgeGroup"
HRR_COLS = ["Pat HRR Name", "Pat HRR ID"]
DATE_COL = "ServiceDate" #"servicedate"
GEO_COL = "PatCountyFIPS" #"patCountyFIPS"
AGE_COL = "PatAgeGroup" #"patAgeGroup"
HRR_COLS = ["Pat HRR Name", "Pat HRR ID"]#["patHRRname", "patHRRid"]
Comment on lines +24 to +27
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are these comments for?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At some point, HSP changed the column names of the files they were sending us. I was testing the code on a recent drop (with lowercase column names), and the trailing comments record those lowercase names. The code originally had the uppercase names, so I changed them back so that the tests pass and the PR can be merged.

ID_COLS = [DATE_COL] + [GEO_COL] + [AGE_COL] + HRR_COLS
FILT_COLS = ID_COLS + COUNT_COLS
DTYPES = {"ServiceDate": str, "PatCountyFIPS": str,
DTYPES = {DATE_COL: str, GEO_COL: str,
"Denominator": int, "Flu1": int,
"Covid_like": int, "Flu_like": int,
"Mixed": int, "PatAgeGroup": str,
"Pat HRR Name": str, "Pat HRR ID": float}
"Mixed": int, AGE_COL: str,
HRR_COLS[0]: str, HRR_COLS[1]: float}

SMOOTHER_BANDWIDTH = 100 # bandwidth for the linear left Gaussian filter
MAX_BACKFILL_WINDOW = 7 # maximum number of days used to average a backfill correction
MIN_CUM_VISITS = 500 # need to observe at least 500 counts before averaging
RECENT_LENGTH = 7 # number of days to sum over for sparsity threshold
MIN_RECENT_VISITS = 100 # min numbers of visits needed to include estimate
MIN_RECENT_OBS = 3 # minimum days needed to produce an estimate for latest time

SENSOR_WINDOW_START = None # 7 # start of training window for sensorization
SENSOR_WINDOW_END = 42 # end of training window for sensorization
29 changes: 15 additions & 14 deletions doctor_visits/delphi_doctor_visits/geo_maps.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ def county_to_msa(self, data):
data = self.gmpr.add_geocode(data,
"fips",
"msa",
from_col="PatCountyFIPS",
from_col=Config.GEO_COL,
new_col="cbsa_id")
data.drop(columns="PatCountyFIPS", inplace=True)
data = data.groupby(["ServiceDate", "cbsa_id"]).sum().reset_index()
data.drop(columns=Config.GEO_COL, inplace=True)
data = data.groupby([Config.DATE_COL, "cbsa_id"]).sum().reset_index()

return data.groupby("cbsa_id"), "cbsa_id"

Expand All @@ -58,9 +58,9 @@ def county_to_state(self, data):
data = self.gmpr.add_geocode(data,
"fips",
"state_id",
from_col="PatCountyFIPS")
data.drop(columns="PatCountyFIPS", inplace=True)
data = data.groupby(["ServiceDate", "state_id"]).sum().reset_index()
from_col=Config.GEO_COL)
data.drop(columns=Config.GEO_COL, inplace=True)
data = data.groupby([Config.DATE_COL, "state_id"]).sum().reset_index()

return data.groupby("state_id"), "state_id"

Expand All @@ -81,11 +81,11 @@ def county_to_hrr(self, data):
data = self.gmpr.add_geocode(data,
"fips",
"hrr",
from_col="PatCountyFIPS")
data.drop(columns="PatCountyFIPS", inplace=True)
from_col=Config.GEO_COL)
data.drop(columns=Config.GEO_COL, inplace=True)

## do a weighted sum by the wpop column to get each HRR's contribution
tmp = data.groupby(["ServiceDate", "hrr"])
tmp = data.groupby([Config.DATE_COL, "hrr"])
wtsum = lambda g: g["weight"].values @ g[Config.COUNT_COLS]
data = tmp.apply(wtsum).reset_index()

Expand All @@ -101,14 +101,15 @@ def county_to_megacounty(self, data, threshold_visits, threshold_len):

Returns: tuple of dataframe at the daily-state resolution, and geo_id column name
"""

all_data = self.gmpr.fips_to_megacounty(data,
threshold_visits,
threshold_len,
fips_col="PatCountyFIPS",
fips_col=Config.GEO_COL,
thr_col="Denominator",
date_col="ServiceDate")
all_data.rename({"megafips": "PatCountyFIPS"}, axis=1, inplace=True)
megacounties = all_data[all_data.PatCountyFIPS.str.endswith("000")]
date_col=Config.DATE_COL)
all_data.rename({"megafips": Config.GEO_COL}, axis=1, inplace=True)
megacounties = all_data[all_data[Config.GEO_COL].str.endswith("000")]
data = pd.concat([data, megacounties])

return data.groupby("PatCountyFIPS"), "PatCountyFIPS"
return data.groupby(Config.GEO_COL), Config.GEO_COL
51 changes: 28 additions & 23 deletions doctor_visits/delphi_doctor_visits/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

# first party
from .update_sensor import update_sensor
from .config import Config


def run_module():
Expand All @@ -25,10 +26,10 @@ def run_module():

## get end date from input file
# the filename is expected to be in the format:
# "EDI_AGG_OUTPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz"
# "EDI_AGG_OUTPATIENT_YYYYMMDD_HHMM{timezone}.csv.gz"
if params["drop_date"] == "":
dropdate_dt = datetime.strptime(
Path(params["input_file"]).name.split("_")[3], "%d%m%Y"
Path(params["input_file"]).name.split("_")[3], "%Y%m%d"
)
else:
dropdate_dt = datetime.strptime(params["drop_date"], "%Y-%m-%d")
Expand All @@ -38,7 +39,8 @@ def run_module():
n_backfill_days = params["n_backfill_days"] # produce estimates for n_backfill_days
n_waiting_days = params["n_waiting_days"] # most recent n_waiting_days won't be est
enddate_dt = dropdate_dt - timedelta(days=n_waiting_days)
startdate_dt = enddate_dt - timedelta(days=n_backfill_days)
startdate_dt = max(enddate_dt - timedelta(days=n_backfill_days),\
Config.FIRST_DATA_DATE + timedelta(days=1) + Config.DAY_SHIFT)
enddate = str(enddate_dt.date())
startdate = str(startdate_dt.date())
logging.info(f"drop date:\t\t{dropdate}")
Expand All @@ -53,30 +55,33 @@ def run_module():
## print out other vars
logging.info("outpath:\t\t%s", params["export_dir"])
logging.info("parallel:\t\t%s", params["parallel"])
logging.info(f"weekday:\t\t%s", params["weekday"])
logging.info(f"write se:\t\t%s", params["se"])
logging.info(f"obfuscated prefix:\t%s", params["obfuscated_prefix"])
logging.info("weekday:\t\t%s", params["weekday"])
logging.info("write se:\t\t%s", params["se"])
logging.info("obfuscated prefix:\t%s", params["obfuscated_prefix"])

## start generating
for geo in geos:
for weekday in params["weekday"]:
if weekday:
logging.info("starting %s, weekday adj", geo)
else:
logging.info("starting %s, no adj", geo)
update_sensor(
filepath=params["input_file"],
outpath=params["export_dir"],
staticpath=params["static_file_dir"],
startdate=startdate,
enddate=enddate,
dropdate=dropdate,
geo=geo,
parallel=params["parallel"],
weekday=weekday,
se=params["se"],
prefix=params["obfuscated_prefix"]
)
for sensorize in params["sensorize"]:
if weekday:
logging.info("starting %s, weekday adj", geo)
else:
logging.info("starting %s, no adj", geo)
update_sensor(
filepath=params["input_file"],
outpath=params["export_dir"],
staticpath=params["static_file_dir"],
startdate=startdate,
enddate=enddate,
dropdate=dropdate,
geo=geo,
parallel=params["parallel"],
weekday=weekday,
se=params["se"],
sensorize=sensorize,
global_sensor_fit=params["global_sensor_fit"],
prefix=params["obfuscated_prefix"]
)
logging.info("finished %s", geo)

logging.info("finished all")
2 changes: 1 addition & 1 deletion doctor_visits/delphi_doctor_visits/sensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def fit(y_data,

Returns: dictionary of results
"""
y_data.set_index("ServiceDate", inplace=True)
y_data.set_index(Config.DATE_COL, inplace=True)
y_data = DoctorVisitsSensor.fill_dates(y_data, fit_dates)
sensor_idxs = np.where(y_data.index >= sensor_dates[0])[0]
n_dates = y_data.shape[0]
Expand Down
Loading