Skip to content

Commit c7d7ce0

Browse files
chinandrew and Ananya-Joshi
authored and committed
Switch to structured logger for ChangeHC
1 parent 9b00342 commit c7d7ce0

File tree

5 files changed

+40
-23
lines changed

5 files changed

+40
-23
lines changed

changehc/delphi_changehc/run.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,8 @@ def run_module(params: Dict[str, Dict[str, Any]]):
173173
weekday,
174174
numtype,
175175
params["indicator"]["se"],
176-
params["indicator"]["wip_signal"]
176+
params["indicator"]["wip_signal"],
177+
logger
177178
)
178179
if numtype == "covid":
179180
data = load_combined_data(file_dict["denom"],

changehc/delphi_changehc/sensor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def backfill(
8787
return new_num, new_den
8888

8989
@staticmethod
90-
def fit(y_data, first_sensor_date, geo_id, num_col="num", den_col="den"):
90+
def fit(y_data, first_sensor_date, geo_id, logger, num_col="num", den_col="den"):
9191
"""Fitting routine.
9292
9393
Args:
@@ -121,7 +121,7 @@ def fit(y_data, first_sensor_date, geo_id, num_col="num", den_col="den"):
121121
se_valid = valid_rates.eval('sqrt(rate * (1 - rate) / den)')
122122
rate_data['se'] = se_valid
123123

124-
logging.debug("{0}: {1:.3f},[{2:.3f}]".format(
124+
logger.debug("{0}: {1:.3f},[{2:.3f}]".format(
125125
geo_id, rate_data['rate'][-1], rate_data['se'][-1]
126126
))
127127
return {"geo_id": geo_id,

changehc/delphi_changehc/update_sensor.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from .weekday import Weekday
2121

2222

23-
def write_to_csv(df, geo_level, write_se, day_shift, out_name, output_path=".", start_date=None, end_date=None):
23+
def write_to_csv(df, geo_level, write_se, day_shift, out_name, logger, output_path=".", start_date=None, end_date=None):
2424
"""Write sensor values to csv.
2525
2626
Args:
@@ -47,15 +47,15 @@ def write_to_csv(df, geo_level, write_se, day_shift, out_name, output_path=".",
4747
assert df[suspicious_se_mask].empty, " se contains suspiciously large values"
4848
assert not df["se"].isna().any(), " se contains nan values"
4949
if write_se:
50-
logging.info("========= WARNING: WRITING SEs TO {0} =========".format(out_name))
50+
logger.info("========= WARNING: WRITING SEs TO {0} =========".format(out_name))
5151
else:
5252
df.loc[:, "se"] = np.nan
5353

5454
assert not df["val"].isna().any(), " val contains nan values"
5555
suspicious_val_mask = df["val"].gt(90)
5656
if not df[suspicious_val_mask].empty:
5757
for geo in df.loc[suspicious_val_mask, "geo_id"]:
58-
logging.warning("value suspiciously high, {0}: {1}".format(
58+
logger.warning("value suspiciously high, {0}: {1}".format(
5959
geo, out_name
6060
))
6161

@@ -68,10 +68,10 @@ def write_to_csv(df, geo_level, write_se, day_shift, out_name, output_path=".",
6868
sensor=out_name,
6969
write_empty_days=True
7070
)
71-
logging.debug("wrote {0} rows for {1} {2}".format(
71+
logger.debug("wrote {0} rows for {1} {2}".format(
7272
df.size, df["geo_id"].unique().size, geo_level
7373
))
74-
logging.debug("wrote files to {0}".format(output_path))
74+
logger.debug("wrote files to {0}".format(output_path))
7575
return dates
7676

7777

@@ -87,7 +87,8 @@ def __init__(self,
8787
weekday,
8888
numtype,
8989
se,
90-
wip_signal):
90+
wip_signal,
91+
logger):
9192
"""Init Sensor Updator.
9293
9394
Args:
@@ -100,7 +101,9 @@ def __init__(self,
100101
numtype: type of count data used, one of ["covid", "cli"]
101102
se: boolean to write out standard errors, if true, use an obfuscated name
102103
wip_signal: Prefix for WIP signals
104+
logger: the structured logger
103105
"""
106+
self.logger = logger
104107
self.startdate, self.enddate, self.dropdate = [
105108
pd.to_datetime(t) for t in (startdate, enddate, dropdate)]
106109
# handle dates
@@ -149,7 +152,7 @@ def geo_reindex(self, data):
149152
geo = self.geo
150153
gmpr = GeoMapper()
151154
if geo not in {"county", "state", "msa", "hrr", "nation", "hhs"}:
152-
logging.error("{0} is invalid, pick one of 'county', "
155+
self.logger.error("{0} is invalid, pick one of 'county', "
153156
"'state', 'msa', 'hrr', 'hss','nation'".format(geo))
154157
return False
155158
if geo == "county":
@@ -201,12 +204,12 @@ def update_sensor(self,
201204
sub_data.reset_index(level=0,inplace=True)
202205
if self.weekday:
203206
sub_data = Weekday.calc_adjustment(wd_params, sub_data)
204-
res = CHCSensor.fit(sub_data, self.burnindate, geo_id)
207+
res = CHCSensor.fit(sub_data, self.burnindate, geo_id, self.logger)
205208
res = pd.DataFrame(res).loc[final_sensor_idxs]
206209
dfs.append(res)
207210
else:
208211
n_cpu = min(10, cpu_count())
209-
logging.debug("starting pool with {0} workers".format(n_cpu))
212+
self.logger.debug("starting pool with {0} workers".format(n_cpu))
210213
with Pool(n_cpu) as pool:
211214
pool_results = []
212215
for geo_id, sub_data in data_frame.groupby(level=0,as_index=False):
@@ -215,7 +218,7 @@ def update_sensor(self,
215218
sub_data = Weekday.calc_adjustment(wd_params, sub_data)
216219
pool_results.append(
217220
pool.apply_async(
218-
CHCSensor.fit, args=(sub_data, self.burnindate, geo_id,),
221+
CHCSensor.fit, args=(sub_data, self.burnindate, geo_id, self.logger),
219222
)
220223
)
221224
pool_results = [proc.get() for proc in pool_results]
@@ -244,7 +247,8 @@ def update_sensor(self,
244247
write_se=self.se,
245248
day_shift=Config.DAY_SHIFT,
246249
out_name=signal,
247-
output_path=output_path
250+
output_path=output_path,
251+
logger=self.logger
248252
)
249253
if len(dates) > 0:
250254
stats.append((max(dates), len(dates)))

changehc/tests/test_sensor.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# standard
2+
import logging
23

34
import numpy as np
45
import numpy.random as nr
@@ -19,6 +20,7 @@
1920
COVID_FILEPATH = PARAMS["indicator"]["input_covid_file"]
2021
DENOM_FILEPATH = PARAMS["indicator"]["input_denom_file"]
2122
DROP_DATE = pd.to_datetime(PARAMS["indicator"]["drop_date"])
23+
TEST_LOGGER = logging.getLogger()
2224

2325
class TestLoadData:
2426
combined_data = load_combined_data(DENOM_FILEPATH, COVID_FILEPATH, DROP_DATE,
@@ -56,7 +58,7 @@ def test_fit_fips(self):
5658
for fips in all_fips:
5759
sub_data = self.combined_data.loc[fips]
5860
sub_data = sub_data.reindex(date_range, fill_value=0)
59-
res0 = CHCSensor.fit(sub_data, date_range[0], fips)
61+
res0 = CHCSensor.fit(sub_data, date_range[0], fips, TEST_LOGGER)
6062

6163
if np.isnan(res0["rate"]).all():
6264
assert res0["incl"].sum() == 0

changehc/tests/test_update_sensor.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# standard
2+
import logging
23
from copy import deepcopy
34
import os
45
from os.path import join, exists
@@ -27,6 +28,7 @@
2728
DENOM_FILEPATH = PARAMS["indicator"]["input_denom_file"]
2829
DROP_DATE = pd.to_datetime(PARAMS["indicator"]["drop_date"])
2930
OUTPATH="test_data/"
31+
TEST_LOGGER = logging.getLogger()
3032

3133
class TestCHCSensorUpdator:
3234
"""Tests for updating the sensors."""
@@ -53,7 +55,8 @@ def test_shift_dates(self):
5355
self.weekday,
5456
self.numtype,
5557
self.se,
56-
""
58+
"",
59+
TEST_LOGGER
5760
)
5861
## Test init
5962
assert su_inst.startdate.month == 2
@@ -77,7 +80,8 @@ def test_geo_reindex(self):
7780
self.weekday,
7881
self.numtype,
7982
self.se,
80-
""
83+
"",
84+
TEST_LOGGER
8185
)
8286
su_inst.shift_dates()
8387
test_data = pd.DataFrame({
@@ -103,7 +107,8 @@ def test_update_sensor(self):
103107
self.weekday,
104108
self.numtype,
105109
self.se,
106-
""
110+
"",
111+
TEST_LOGGER
107112
)
108113
# As of 3/3/21 (40c258a), this set of data has county outputting data, state and hhs not
109114
# outputting data, and nation outputting data, which is undesirable. Ideal behaviour
@@ -149,7 +154,8 @@ def test_write_to_csv_results(self):
149154
write_se=False,
150155
day_shift=CONFIG.DAY_SHIFT,
151156
out_name="name_of_signal",
152-
output_path=td.name
157+
output_path=td.name,
158+
logger=TEST_LOGGER
153159
)
154160

155161
# check outputs
@@ -203,7 +209,8 @@ def test_write_to_csv_with_se_results(self):
203209
write_se=True,
204210
day_shift=CONFIG.DAY_SHIFT,
205211
out_name="name_of_signal",
206-
output_path=td.name
212+
output_path=td.name,
213+
logger=TEST_LOGGER
207214
)
208215

209216
# check outputs
@@ -243,7 +250,8 @@ def test_write_to_csv_wrong_results(self):
243250
write_se=False,
244251
day_shift=CONFIG.DAY_SHIFT,
245252
out_name="name_of_signal",
246-
output_path=td.name
253+
output_path=td.name,
254+
logger=TEST_LOGGER
247255
)
248256

249257
# nan se for included loc-date
@@ -258,7 +266,8 @@ def test_write_to_csv_wrong_results(self):
258266
write_se=True,
259267
day_shift=CONFIG.DAY_SHIFT,
260268
out_name="name_of_signal",
261-
output_path=td.name
269+
output_path=td.name,
270+
logger=TEST_LOGGER
262271
)
263272

264273
# large se value
@@ -273,7 +282,8 @@ def test_write_to_csv_wrong_results(self):
273282
write_se=True,
274283
day_shift=CONFIG.DAY_SHIFT,
275284
out_name="name_of_signal",
276-
output_path=td.name
285+
output_path=td.name,
286+
logger=TEST_LOGGER
277287
)
278288

279289
td.cleanup()

0 commit comments

Comments
 (0)