cmu-delphi · rumackaaron · Nov 17, 2020 · Nov 17, 2020 · Nov 19, 2020 · Nov 20, 2020
diff --git a/doctor_visits/delphi_doctor_visits/config.py b/doctor_visits/delphi_doctor_visits/config.py
@@ -21,21 +21,24 @@ class Config:
     CLI_COLS = ["Covid_like", "Flu_like", "Mixed"]
     FLU1_COL = ["Flu1"]
     COUNT_COLS = CLI_COLS + FLU1_COL + ["Denominator"]
-    DATE_COL = "ServiceDate"
-    GEO_COL = "PatCountyFIPS"
-    AGE_COL = "PatAgeGroup"
-    HRR_COLS = ["Pat HRR Name", "Pat HRR ID"]
+    DATE_COL = "ServiceDate"  # inactive alternative: "servicedate"
+    GEO_COL = "PatCountyFIPS"  # inactive alternative: "patCountyFIPS"
+    AGE_COL = "PatAgeGroup"  # inactive alternative: "patAgeGroup"
+    HRR_COLS = ["Pat HRR Name", "Pat HRR ID"]  # inactive alternative: ["patHRRname", "patHRRid"]
     ID_COLS = [DATE_COL] + [GEO_COL] + [AGE_COL] + HRR_COLS
     FILT_COLS = ID_COLS + COUNT_COLS
-    DTYPES = {"ServiceDate": str, "PatCountyFIPS": str,
+    DTYPES = {DATE_COL: str, GEO_COL: str,
               "Denominator": int, "Flu1": int,
               "Covid_like": int, "Flu_like": int,
-              "Mixed": int, "PatAgeGroup": str,
-              "Pat HRR Name": str, "Pat HRR ID": float}
+              "Mixed": int, AGE_COL: str,
+              HRR_COLS[0]: str, HRR_COLS[1]: float}
 
     SMOOTHER_BANDWIDTH = 100  # bandwidth for the linear left Gaussian filter
     MAX_BACKFILL_WINDOW = 7  # maximum number of days used to average a backfill correction
     MIN_CUM_VISITS = 500  # need to observe at least 500 counts before averaging
     RECENT_LENGTH = 7  # number of days to sum over for sparsity threshold
     MIN_RECENT_VISITS = 100  # min numbers of visits needed to include estimate
     MIN_RECENT_OBS = 3  # minimum days needed to produce an estimate for latest time
+
+    SENSOR_WINDOW_START = None  # start of training window for sensorization (inactive alternative: 7)
+    SENSOR_WINDOW_END = 42  # end of training window for sensorization
diff --git a/doctor_visits/delphi_doctor_visits/geo_maps.py b/doctor_visits/delphi_doctor_visits/geo_maps.py
@@ -40,10 +40,10 @@ def county_to_msa(self, data):
         data = self.gmpr.add_geocode(data,
                                      "fips",
                                      "msa",
-                                     from_col="PatCountyFIPS",
+                                     from_col=Config.GEO_COL,
                                      new_col="cbsa_id")
-        data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "cbsa_id"]).sum().reset_index()
+        data.drop(columns=Config.GEO_COL, inplace=True)
+        data = data.groupby([Config.DATE_COL, "cbsa_id"]).sum().reset_index()
 
         return data.groupby("cbsa_id"), "cbsa_id"
 
@@ -58,9 +58,9 @@ def county_to_state(self, data):
         data = self.gmpr.add_geocode(data,
                                      "fips",
                                      "state_id",
-                                     from_col="PatCountyFIPS")
-        data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "state_id"]).sum().reset_index()
+                                     from_col=Config.GEO_COL)
+        data.drop(columns=Config.GEO_COL, inplace=True)
+        data = data.groupby([Config.DATE_COL, "state_id"]).sum().reset_index()
 
         return data.groupby("state_id"), "state_id"
 
@@ -81,11 +81,11 @@ def county_to_hrr(self, data):
         data = self.gmpr.add_geocode(data,
                                      "fips",
                                      "hrr",
-                                     from_col="PatCountyFIPS")
-        data.drop(columns="PatCountyFIPS", inplace=True)
+                                     from_col=Config.GEO_COL)
+        data.drop(columns=Config.GEO_COL, inplace=True)
 
         ## do a weighted sum by the wpop column to get each HRR's contribution
-        tmp = data.groupby(["ServiceDate", "hrr"])
+        tmp = data.groupby([Config.DATE_COL, "hrr"])
         wtsum = lambda g: g["weight"].values @ g[Config.COUNT_COLS]
         data = tmp.apply(wtsum).reset_index()
 
@@ -101,14 +101,15 @@ def county_to_megacounty(self, data, threshold_visits, threshold_len):
 
         Returns: tuple of dataframe at the daily-state resolution, and geo_id column name
         """
+
         all_data = self.gmpr.fips_to_megacounty(data,
                                             threshold_visits,
                                             threshold_len,
-                                            fips_col="PatCountyFIPS",
+                                            fips_col=Config.GEO_COL,
                                             thr_col="Denominator",
-                                            date_col="ServiceDate")
-        all_data.rename({"megafips": "PatCountyFIPS"}, axis=1, inplace=True)
-        megacounties = all_data[all_data.PatCountyFIPS.str.endswith("000")]
+                                            date_col=Config.DATE_COL)
+        all_data.rename({"megafips": Config.GEO_COL}, axis=1, inplace=True)
+        megacounties = all_data[all_data[Config.GEO_COL].str.endswith("000")]
         data = pd.concat([data, megacounties])
 
-        return data.groupby("PatCountyFIPS"), "PatCountyFIPS"
+        return data.groupby(Config.GEO_COL), Config.GEO_COL
diff --git a/doctor_visits/delphi_doctor_visits/run.py b/doctor_visits/delphi_doctor_visits/run.py
@@ -15,6 +15,7 @@
 
 # first party
 from .update_sensor import update_sensor
+from .config import Config
 
 
 def run_module():
@@ -25,10 +26,10 @@ def run_module():
 
     ## get end date from input file
     # the filename is expected to be in the format:
-    # "EDI_AGG_OUTPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz"
+    # "EDI_AGG_OUTPATIENT_YYYYMMDD_HHMM{timezone}.csv.gz"
     if params["drop_date"] == "":
         dropdate_dt = datetime.strptime(
-            Path(params["input_file"]).name.split("_")[3], "%d%m%Y"
+            Path(params["input_file"]).name.split("_")[3], "%Y%m%d"
         )
     else:
         dropdate_dt = datetime.strptime(params["drop_date"], "%Y-%m-%d")
@@ -38,7 +39,8 @@ def run_module():
     n_backfill_days = params["n_backfill_days"] # produce estimates for n_backfill_days
     n_waiting_days = params["n_waiting_days"]  # most recent n_waiting_days won't be est
     enddate_dt = dropdate_dt - timedelta(days=n_waiting_days)
-    startdate_dt = enddate_dt - timedelta(days=n_backfill_days)
+    startdate_dt = max(enddate_dt - timedelta(days=n_backfill_days),\
+                        Config.FIRST_DATA_DATE + timedelta(days=1) + Config.DAY_SHIFT)
     enddate = str(enddate_dt.date())
     startdate = str(startdate_dt.date())
     logging.info(f"drop date:\t\t{dropdate}")
@@ -53,30 +55,33 @@ def run_module():
     ## print out other vars
     logging.info("outpath:\t\t%s", params["export_dir"])
     logging.info("parallel:\t\t%s", params["parallel"])
-    logging.info(f"weekday:\t\t%s", params["weekday"])
-    logging.info(f"write se:\t\t%s", params["se"])
-    logging.info(f"obfuscated prefix:\t%s", params["obfuscated_prefix"])
+    logging.info("weekday:\t\t%s", params["weekday"])
+    logging.info("write se:\t\t%s", params["se"])
+    logging.info("obfuscated prefix:\t%s", params["obfuscated_prefix"])
 
     ## start generating
     for geo in geos:
         for weekday in params["weekday"]:
-            if weekday:
-                logging.info("starting %s, weekday adj", geo)
-            else:
-                logging.info("starting %s, no adj", geo)
-            update_sensor(
-                filepath=params["input_file"],
-                outpath=params["export_dir"],
-                staticpath=params["static_file_dir"],
-                startdate=startdate,
-                enddate=enddate,
-                dropdate=dropdate,
-                geo=geo,
-                parallel=params["parallel"],
-                weekday=weekday,
-                se=params["se"],
-                prefix=params["obfuscated_prefix"]
-            )
+            for sensorize in params["sensorize"]:
+                if weekday:
+                    logging.info("starting %s, weekday adj", geo)
+                else:
+                    logging.info("starting %s, no adj", geo)
+                update_sensor(
+                    filepath=params["input_file"],
+                    outpath=params["export_dir"],
+                    staticpath=params["static_file_dir"],
+                    startdate=startdate,
+                    enddate=enddate,
+                    dropdate=dropdate,
+                    geo=geo,
+                    parallel=params["parallel"],
+                    weekday=weekday,
+                    se=params["se"],
+                    sensorize=sensorize,
+                    global_sensor_fit=params["global_sensor_fit"],
+                    prefix=params["obfuscated_prefix"]
+                )
         logging.info("finished %s", geo)
 
     logging.info("finished all")
diff --git a/doctor_visits/delphi_doctor_visits/sensor.py b/doctor_visits/delphi_doctor_visits/sensor.py
@@ -180,7 +180,7 @@ def fit(y_data,
 
         Returns: dictionary of results
         """
-        y_data.set_index("ServiceDate", inplace=True)
+        y_data.set_index(Config.DATE_COL, inplace=True)
         y_data = DoctorVisitsSensor.fill_dates(y_data, fit_dates)
         sensor_idxs = np.where(y_data.index >= sensor_dates[0])[0]
         n_dates = y_data.shape[0]