# third party
import numpy as np
import pandas as pd
- from delphi_utils import GeoMapper, read_params, add_prefix
+ from delphi_utils import GeoMapper, add_prefix, create_export_csv

# first party
from .config import Config
from .weekday import Weekday


- def write_to_csv(output_dict, write_se, out_name, output_path="."):
+ def write_to_csv(df, geo_level, write_se, day_shift, out_name, output_path=".", start_date=None, end_date=None):
    """Write sensor values to csv.

    Args:
-       output_dict: dictionary containing sensor rates, se, unique dates, and unique geo_id
+       df: dataframe containing unique timestamp, unique geo_id, val, se, sample_size
+       geo_level: the geographic level being written, e.g. county or state
        write_se: boolean to write out standard errors; if true, use an obfuscated name
+       day_shift: a timedelta specifying the time shift to apply to the dates
        out_name: name of the output file
        output_path: outfile path to write the csv (default is current directory)
+       start_date: the first date of the dates to be written
+       end_date: the last date of the dates to be written
    """
+     df = df.copy()
+
+     # shift dates forward for labeling
+     df["timestamp"] += day_shift
+     if start_date is None:
+         start_date = min(df["timestamp"])
+     if end_date is None:
+         end_date = max(df["timestamp"])
+
+     # suspicious value warnings
+     suspicious_se_mask = df["se"].gt(5)
+     assert df[suspicious_se_mask].empty, " se contains suspiciously large values"
+     assert not df["se"].isna().any(), " se contains nan values"
    if write_se:
        logging.info("========= WARNING: WRITING SEs TO {0} =========".format(out_name))
-     geo_level = output_dict["geo_level"]
-     dates = output_dict["dates"]
-     geo_ids = output_dict["geo_ids"]
-     all_rates = output_dict["rates"]
-     all_se = output_dict["se"]
-     all_include = output_dict["include"]
-     out_n = 0
-     for i, d in enumerate(dates):
-         filename = "%s/%s_%s_%s.csv" % (
-             output_path,
-             (d + Config.DAY_SHIFT).strftime("%Y%m%d"),
-             geo_level,
-             out_name,
-         )
-         with open(filename, "w") as outfile:
-             outfile.write("geo_id,val,se,direction,sample_size\n")
-             for geo_id in geo_ids:
-                 sensor = all_rates[geo_id][i]
-                 se = all_se[geo_id][i]
-                 if all_include[geo_id][i]:
-                     assert not np.isnan(sensor), "value for included sensor is nan"
-                     assert not np.isnan(se), "se for included sensor is nan"
-                     if sensor > 90:
-                         logging.warning("value suspiciously high, {0}: {1}".format(
-                             geo_id, sensor
-                         ))
-                     assert se < 5, f"se suspiciously high, {geo_id}: {se}"
-                     if write_se:
-                         assert sensor > 0 and se > 0, "p=0, std_err=0 invalid"
-                         outfile.write(
-                             "%s,%f,%s,%s,%s\n" % (geo_id, sensor, se, NA, NA))
-                     else:
-                         # for privacy reasons we will not report the standard error
-                         outfile.write(
-                             "%s,%f,%s,%s,%s\n" % (geo_id, sensor, NA, NA, NA)
-                         )
-                     out_n += 1
+     else:
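+         # for privacy reasons we will not report the standard error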
+         df.loc[:, "se"] = np.nan
+
+     assert not df["val"].isna().any(), " val contains nan values"
+     suspicious_val_mask = df["val"].gt(90)
+     if not df[suspicious_val_mask].empty:
+         for geo in df.loc[suspicious_val_mask, "geo_id"]:
+             logging.warning("value suspiciously high, {0}: {1}".format(
+                 geo, out_name
+             ))
+
+     create_export_csv(
+         df,
+         export_dir=output_path,
+         geo_res=geo_level,
+         start_date=start_date,
+         end_date=end_date,
+         sensor=out_name,
+         write_empty_days=True
+     )
71
71
logging .debug ("wrote {0} rows for {1} {2}" .format (
72
- out_n , len ( geo_ids ) , geo_level
72
+ df . size , df [ "geo_id" ]. unique (). size , geo_level
73
73
))
74
+ logging .debug ("wrote files to {0}" .format (output_path ))


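For reference, here is a minimal sketch of calling the rewritten write_to_csv on its own. It assumes a toy dataframe with the columns the docstring describes; the geo ids, values, signal name, and output directory are illustrative only, not part of this diff:

    import numpy as np
    import pandas as pd
    from datetime import timedelta

    example_df = pd.DataFrame({
        "timestamp": pd.to_datetime(["2020-05-01", "2020-05-01"]),
        "geo_id": ["01001", "01003"],     # hypothetical county FIPS codes
        "val": [1.5, 2.5],                # sensor estimates
        "se": [0.2, 0.3],                 # standard errors, within the gt(5) guard
        "sample_size": [np.nan, np.nan],  # never shared for this indicator
    })
    # write_se=False blanks the se column; create_export_csv then writes
    # one CSV per day under ./receiving
    write_to_csv(example_df, geo_level="county", write_se=False,
                 day_shift=timedelta(days=1), out_name="smoothed_outpatient_cli",
                 output_path="./receiving")
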
class CHCSensorUpdator:  # pylint: disable=too-many-instance-attributes
@@ -176,12 +177,12 @@ def geo_reindex(self, data):
    def update_sensor(self,
                      data,
-                     outpath):
+                     output_path):
        """Generate sensor values, and write to csv format.

        Args:
            data: pd.DataFrame with columns num and den
-           outpath: output path for the csv results
+           output_path: output path for the csv results
        """
        self.shift_dates()
        final_sensor_idxs = (self.burn_in_dates >= self.startdate) & \
@@ -193,19 +194,15 @@ def update_sensor(self,
        # handle if we need to adjust by weekday
        wd_params = Weekday.get_params(data_frame) if self.weekday else None
        # run sensor fitting code (maybe in parallel)
-         sensor_rates = {}
-         sensor_se = {}
-         sensor_include = {}
        if not self.parallel:
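+             # collect one fitted dataframe per geo_id; concatenated after fitting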
+             dfs = []
            for geo_id, sub_data in data_frame.groupby(level=0):
                sub_data.reset_index(level=0, inplace=True)
                if self.weekday:
                    sub_data = Weekday.calc_adjustment(wd_params, sub_data)
                res = CHCSensor.fit(sub_data, self.burnindate, geo_id)
-                 res = pd.DataFrame(res)
-                 sensor_rates[geo_id] = np.array(res.loc[final_sensor_idxs, "rate"])
-                 sensor_se[geo_id] = np.array(res.loc[final_sensor_idxs, "se"])
-                 sensor_include[geo_id] = np.array(res.loc[final_sensor_idxs, "incl"])
+                 res = pd.DataFrame(res).loc[final_sensor_idxs]
+                 dfs.append(res)
        else:
            n_cpu = min(10, cpu_count())
            logging.debug("starting pool with {0} workers".format(n_cpu))
@@ -221,23 +218,29 @@ def update_sensor(self,
                )
            )
            pool_results = [proc.get() for proc in pool_results]
+             dfs = []
            for res in pool_results:
-                 geo_id = res["geo_id"]
-                 res = pd.DataFrame(res)
-                 sensor_rates[geo_id] = np.array(res.loc[final_sensor_idxs, "rate"])
-                 sensor_se[geo_id] = np.array(res.loc[final_sensor_idxs, "se"])
-                 sensor_include[geo_id] = np.array(res.loc[final_sensor_idxs, "incl"])
-         unique_geo_ids = list(sensor_rates.keys())
-         output_dict = {
-             "rates": sensor_rates,
-             "se": sensor_se,
-             "dates": self.sensor_dates,
-             "geo_ids": unique_geo_ids,
-             "geo_level": self.geo,
-             "include": sensor_include,
-         }
+                 res = pd.DataFrame(res).loc[final_sensor_idxs]
+                 dfs.append(res)
+
+         # Form the output dataframe
+         df = pd.concat(dfs)
+         # sample size is never shared
+         df["sample_size"] = np.nan
+         # conform to naming expected by create_export_csv()
+         df = df.reset_index().rename(columns={"date": "timestamp", "rate": "val"})
+         # df.loc[~df['incl'], ["val", "se"]] = np.nan  # update to this line after nancodes get merged in
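+         # until then, keep only the rows the sensor fit marked for inclusion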
+         df = df[df['incl']]

        # write out results
        for signal in self.updated_signal_names:
-             write_to_csv(output_dict, self.se, signal, outpath)
-         logging.debug("wrote files to {0}".format(outpath))
+             write_to_csv(
+                 df,
+                 geo_level=self.geo,
+                 start_date=min(self.sensor_dates),
+                 end_date=max(self.sensor_dates),
+                 write_se=self.se,
+                 day_shift=Config.DAY_SHIFT,
+                 out_name=signal,
+                 output_path=output_path
+             )
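
For comparison, a rough sketch of the per-day export that create_export_csv now takes over. The filename pattern is copied from the loop deleted above; whether the utility matches it exactly is an assumption, not something this diff shows:

    # one file per day: <output_path>/<YYYYMMDD>_<geo_level>_<out_name>.csv
    for day, day_df in df.groupby("timestamp"):
        filename = "%s/%s_%s_%s.csv" % (output_path, day.strftime("%Y%m%d"),
                                        geo_level, out_name)
        day_df[["geo_id", "val", "se", "sample_size"]].to_csv(filename, index=False)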