
Commit 40dfe17

merge changes from covid-19 repo updates

1 parent 448bc77 commit 40dfe17
5 files changed: +163 −114 lines changed

doctor_visits/delphi_doctor_visits/config.py

Lines changed: 8 additions & 22 deletions
```diff
@@ -3,7 +3,7 @@
 
 Author: Maria
 Created: 2020-04-16
-Last modified: 2020-05-12
+Last modified: 2020-06-17
 """
 
 from datetime import datetime, timedelta
@@ -14,11 +14,8 @@ class Config:
     """
 
     # dates
+    FIRST_DATA_DATE = datetime(2020, 1, 1)
     DAY_SHIFT = timedelta(days=1)  # shift dates forward for labeling purposes
-    # Feb 1 is when we start producing sensors
-    FIRST_SENSOR_DATE = datetime(2020, 2, 1) - DAY_SHIFT
-    # add burn-in sensor dates to calculate direction
-    BURN_IN_DATE = datetime(2020, 1, 29) - DAY_SHIFT
 
     # data columns
     CLI_COLS = ["Covid_like", "Flu_like", "Mixed"]
@@ -30,26 +27,15 @@ class Config:
     HRR_COLS = ["Pat HRR Name", "Pat HRR ID"]
     ID_COLS = [DATE_COL] + [GEO_COL] + [AGE_COL] + HRR_COLS
     FILT_COLS = ID_COLS + COUNT_COLS
-    DTYPES = {
-        "ServiceDate": str,
-        "PatCountyFIPS": str,
-        "Denominator": int,
-        "Flu1": int,
-        "Covid_like": int,
-        "Flu_like": int,
-        "Mixed": int,
-        "PatAgeGroup": str,
-        "Pat HRR Name": str,
-        "Pat HRR ID": float,
-    }
+    DTYPES = {"ServiceDate": str, "PatCountyFIPS": str,
+              "Denominator": int, "Flu1": int,
+              "Covid_like": int, "Flu_like": int,
+              "Mixed": int, "PatAgeGroup": str,
+              "Pat HRR Name": str, "Pat HRR ID": float}
 
     SMOOTHER_BANDWIDTH = 100  # bandwidth for the linear left Gaussian filter
-    MIN_OBS = 2500  # number of total visits needed to produce a sensor
-    MAX_BACKFILL_WINDOW = (
-        7  # maximum number of days used to average a backfill correction
-    )
+    MAX_BACKFILL_WINDOW = 7  # maximum number of days used to average a backfill correction
    MIN_CUM_VISITS = 500  # need to observe at least 500 counts before averaging
     RECENT_LENGTH = 7  # number of days to sum over for sparsity threshold
     MIN_RECENT_VISITS = 100  # min numbers of visits needed to include estimate
     MIN_RECENT_OBS = 3  # minimum days needed to produce an estimate for latest time
-    assert MIN_OBS >= MIN_CUM_VISITS, "Backfill adjustment not guaranteed to work"
```
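The DTYPES table above was compacted but is otherwise unchanged; it pins the schema applied when the drop file is read, so county FIPS codes keep their leading zeros and count columns stay integral. A minimal sketch of how such a mapping is typically consumed, assuming a pandas-based loader and an illustrative filename (the module's actual load step may differ):

```python
import pandas as pd

from delphi_doctor_visits.config import Config

# illustrative drop file; real names follow
# EDI_AGG_OUTPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz
df = pd.read_csv("EDI_AGG_OUTPATIENT_17062020_1455CDT.csv.gz",
                 dtype=Config.DTYPES)

# dates arrive as strings; convert, then keep rows from FIRST_DATA_DATE on
df[Config.DATE_COL] = pd.to_datetime(df[Config.DATE_COL])
df = df[df[Config.DATE_COL] >= Config.FIRST_DATA_DATE]
```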

doctor_visits/delphi_doctor_visits/run.py

Lines changed: 35 additions & 22 deletions
```diff
@@ -7,7 +7,7 @@
 
 # standard packages
 import logging
-from datetime import datetime
+from datetime import datetime, timedelta
 from pathlib import Path
 
 # third party
@@ -23,46 +23,59 @@ def run_module():
 
     logging.basicConfig(level=logging.DEBUG)
 
-    ## start date will be Jan 1
-    logging.info("start date:\t%s", params["start_date"])
-
     ## get end date from input file
     # the filename is expected to be in the format:
     # "EDI_AGG_OUTPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz"
-    if params["end_date"] == "":
-        dropdate = str(
-            datetime.strptime(
-                Path(params["input_file"]).name.split("_")[3], "%d%m%Y"
-            ).date()
+    if params["drop_date"] == "":
+        dropdate_dt = datetime.strptime(
+            Path(params["input_file"]).name.split("_")[3], "%d%m%Y"
         )
     else:
-        dropdate = params["end_date"]
+        dropdate_dt = datetime.strptime(params["end_date"], "%Y-%m-%d")
+    dropdate = str(dropdate_dt.date())
 
-    logging.info("drop date:\t%s", dropdate)
+    # range of estimates to produce
+    n_backfill_days = params["n_backfill_days"]  # produce estimates for n_backfill_days
+    n_waiting_days = params["n_waiting_days"]  # most recent n_waiting_days won't be est
+    enddate_dt = dropdate_dt - timedelta(days=n_waiting_days)
+    startdate_dt = enddate_dt - timedelta(days=n_backfill_days)
+    enddate = str(enddate_dt.date())
+    startdate = str(startdate_dt.date())
+    logging.info(f"drop date:\t\t{dropdate}")
+    logging.info(f"first sensor date:\t{startdate}")
+    logging.info(f"last sensor date:\t{enddate}")
+    logging.info(f"n_backfill_days:\t{n_backfill_days}")
+    logging.info(f"n_waiting_days:\t{n_waiting_days}")
 
     ## geographies
     geos = ["state", "msa", "hrr", "county"]
 
     ## print out other vars
-    logging.info("outpath:\t%s", params["export_dir"])
-    logging.info("parallel:\t%s", params["parallel"])
+    logging.info("outpath:\t\t%s", params["export_dir"])
+    logging.info("parallel:\t\t%s", params["parallel"])
+    logging.info(f"weekday:\t\t%s", params["weekday"])
+    logging.info(f"write se:\t\t%s", params["se"])
+    logging.info(f"obfuscated prefix:\t%s", params["obfuscated_prefix"])
 
     ## start generating
     for geo in geos:
-        for weekday in [True, False]:
+        for weekday in params["weekday"]:
             if weekday:
                 logging.info("starting %s, weekday adj", geo)
             else:
                 logging.info("starting %s, no adj", geo)
             update_sensor(
-                params["input_file"],
-                params["export_dir"],
-                params["static_file_dir"],
-                params["start_date"],
-                dropdate,
-                geo,
-                params["parallel"],
-                weekday,
+                filepath=params["input_file"],
+                outpath=params["export_dir"],
+                staticpath=params["static_file_dir"],
+                startdate=startdate,
+                enddate=enddate,
+                dropdate=dropdate,
+                geo=geo,
+                parallel=params["parallel"],
+                weekday=weekday,
+                se=params["se"],
+                prefix=params["obfuscated_prefix"]
             )
             logging.info("finished %s", geo)
 
```
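With FIRST_SENSOR_DATE and BURN_IN_DATE gone from Config, the sensor date range is now derived at runtime: the drop date comes from the input filename (or from params), the most recent n_waiting_days are withheld as too incomplete to estimate, and estimates reach back n_backfill_days before that. A self-contained sketch of the same arithmetic, with assumed parameter values standing in for params.json:

```python
from datetime import datetime, timedelta
from pathlib import Path

# assumed values for illustration; the real ones come from params
input_file = "receiving/EDI_AGG_OUTPATIENT_17062020_1455CDT.csv.gz"
n_backfill_days = 60  # produce estimates this far back
n_waiting_days = 3    # most recent days are too incomplete to estimate

# filename format: EDI_AGG_OUTPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz
dropdate_dt = datetime.strptime(Path(input_file).name.split("_")[3], "%d%m%Y")
enddate_dt = dropdate_dt - timedelta(days=n_waiting_days)
startdate_dt = enddate_dt - timedelta(days=n_backfill_days)

print(dropdate_dt.date())   # 2020-06-17
print(enddate_dt.date())    # 2020-06-14
print(startdate_dt.date())  # 2020-04-15
```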

doctor_visits/delphi_doctor_visits/sensor.py

Lines changed: 43 additions & 19 deletions
```diff
@@ -25,7 +25,7 @@ class DoctorVisitsSensor:
 
     @staticmethod
     def transform(
-        sig, h=Config.SMOOTHER_BANDWIDTH, smoother=left_gauss_linear, base=None
+            sig, h=Config.SMOOTHER_BANDWIDTH, smoother=left_gauss_linear, base=None
     ):
         """Transform signal by applying a smoother, and/or adjusting by a base.
 
@@ -80,12 +80,12 @@ def fill_dates(y_data, dates):
 
     @staticmethod
     def backfill(
-        num,
-        den,
-        k=Config.MAX_BACKFILL_WINDOW,
-        min_visits_to_fill=Config.MIN_CUM_VISITS,
-        min_visits_to_include=Config.MIN_RECENT_VISITS,
-        min_recent_obs_to_include=Config.MIN_RECENT_OBS,
+            num,
+            den,
+            k=Config.MAX_BACKFILL_WINDOW,
+            min_visits_to_fill=Config.MIN_CUM_VISITS,
+            min_visits_to_include=Config.MIN_RECENT_VISITS,
+            min_recent_obs_to_include=Config.MIN_RECENT_OBS,
     ):
         """
         Adjust for backfill (retroactively added observations) by using a
@@ -129,17 +129,17 @@ def backfill(
                 for j in range(p):
                     new_num[i, j] = revnum[i, j]
             else:
-                den_bin = revden[i : (i + closest_fill_day + 1)]
+                den_bin = revden[i: (i + closest_fill_day + 1)]
                 new_den[i] = den_bin.sum()
 
                 for j in range(p):
-                    num_bin = revnum[i : (i + closest_fill_day + 1), j]
+                    num_bin = revnum[i: (i + closest_fill_day + 1), j]
                     new_num[i, j] = num_bin.sum()
 
             # if we do not observe at least min_visits_to_include in the denominator or
             # if we observe 0 counts for min_recent_obs window, don't show.
             if (new_den[i] < min_visits_to_include) or (
-                revden[i:][:min_recent_obs_to_include].sum() == 0
+                    revden[i:][:min_recent_obs_to_include].sum() == 0
             ):
                 include[i] = False
 
@@ -156,7 +156,13 @@ def backfill(
         return new_num, new_den, include
 
     @staticmethod
-    def fit(y_data, fit_dates, sensor_dates, geo_id, recent_min_visits, min_recent_obs):
+    def fit(y_data,
+            fit_dates,
+            sensor_dates,
+            geo_id,
+            recent_min_visits,
+            min_recent_obs,
+            jeffreys):
         """Fitting routine.
 
         Args:
@@ -168,6 +174,9 @@ def fit(y_data, fit_dates, sensor_dates, geo_id, recent_min_visits, min_recent_o
                 <RECENT_LENGTH> days
             min_recent_obs: location is sparse also if it has 0 observations in the
                 last min_recent_obs days
+            jeffreys: boolean whether to use Jeffreys estimate for binomial proportion, this
+                is currently only applied if we are writing SEs out. The estimate is
+                p_hat = (x + 0.5)/(n + 1).
 
         Returns: dictionary of results
         """
@@ -176,28 +185,43 @@ def fit(y_data, fit_dates, sensor_dates, geo_id, recent_min_visits, min_recent_o
         sensor_idxs = np.where(y_data.index >= sensor_dates[0])[0]
         n_dates = y_data.shape[0]
 
+        # combine Flu_like and Mixed columns
+        y_data["Flu_like_Mixed"] = y_data["Flu_like"] + y_data["Mixed"]
+        NEW_CLI_COLS = list(set(Config.CLI_COLS) - {"Flu_like", "Mixed"}) + [
+            "Flu_like_Mixed"]
+
+        # small backfill correction
         total_visits = y_data["Denominator"]
-        total_counts = y_data[Config.CLI_COLS + Config.FLU1_COL]
+        total_counts = y_data[NEW_CLI_COLS + Config.FLU1_COL]
         total_counts, total_visits, include = DoctorVisitsSensor.backfill(
             total_counts,
             total_visits,
             min_visits_to_include=recent_min_visits,
-            min_recent_obs_to_include=min_recent_obs,
+            min_recent_obs_to_include=min_recent_obs
         )
-        total_rates = total_counts.div(total_visits, axis=0)
+
+        # jeffreys inflation
+        if jeffreys:
+            total_counts[NEW_CLI_COLS] = total_counts[NEW_CLI_COLS] + 0.5
+            total_rates = total_counts.div(total_visits + 1, axis=0)
+        else:
+            total_rates = total_counts.div(total_visits, axis=0)
+
         total_rates.fillna(0, inplace=True)
         flu1 = total_rates[Config.FLU1_COL]
         new_rates = []
-        for code in Config.CLI_COLS:
+        for code in NEW_CLI_COLS:
             code_vals = total_rates[code]
 
             # if all rates are zero, don't bother
             if code_vals.sum() == 0:
+                if jeffreys:
+                    logging.error("p is 0 even though we used Jefferys estimate")
                 new_rates.append(np.zeros((n_dates,)))
                 continue
 
             # include adjustment for flu like codes
-            base = flu1 if code in ["Flu_like", "Mixed"] else None
+            base = flu1 if code in ["Flu_like_Mixed"] else None
             fitted_codes = DoctorVisitsSensor.transform(
                 code_vals.values.reshape(-1, 1), base=base
             )
@@ -211,9 +235,9 @@ def fit(y_data, fit_dates, sensor_dates, geo_id, recent_min_visits, min_recent_o
         den = total_visits[sensor_idxs].values
 
         # calculate standard error
-        mask = den < 1
-        se = np.sqrt(np.divide((new_rates * (1 - new_rates)), den, where=den != 0))
-        se[mask] = np.nan  # handle case where we observe no visits
+        se = np.full_like(new_rates, np.nan)
+        se[include] = np.sqrt(
+            np.divide((new_rates[include] * (1 - new_rates[include])), den[include]))
 
         logging.debug(f"{geo_id}: {new_rates[-1]:.3f},[{se[-1]:.3f}]")
         return {"geo_id": geo_id, "rate": new_rates, "se": se, "incl": include}
```
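Two numerical changes land in fit(): the optional Jeffreys estimate p_hat = (x + 0.5)/(n + 1), which keeps the proportion (and hence its standard error) strictly positive even when zero CLI visits are observed, and standard errors that now start NaN-filled and are computed only where the backfill sparsity filter set include. A self-contained sketch of both ideas on made-up counts (the committed code applies the SE formula to smoothed rates, so this is illustrative only):

```python
import numpy as np

# made-up counts: the first day observes zero CLI visits
x = np.array([0.0, 12.0, 30.0])          # numerator counts
n = np.array([200.0, 400.0, 1000.0])     # total visits (denominator)
include = np.array([True, True, False])  # sparsity mask from backfill()

# Jeffreys-adjusted binomial proportion: p_hat = (x + 0.5) / (n + 1);
# strictly positive even at x = 0, so the SE below never collapses to 0
p_hat = (x + 0.5) / (n + 1)

# SEs only where the location passed the sparsity filter; excluded
# entries stay NaN, mirroring the np.full_like pattern in the diff
se = np.full_like(p_hat, np.nan)
se[include] = np.sqrt(p_hat[include] * (1 - p_hat[include]) / n[include])

print(np.round(p_hat, 5))  # [0.00249 0.03117 0.03047]
print(np.round(se, 5))     # [0.00352 0.00869     nan]
```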
