
Commit d64dc06

NAN codes for JHU:
* keep nan values, add missing columns, add missing today, add tests
1 parent: 3f6f456

File tree: 3 files changed (+199, -31 lines)
_delphi_utils_python/delphi_utils/nancodes.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 """Provides unified not-a-number codes for the indicators.

 Currently requires a manual sync between the covidcast-indicators
-and the delphi-epidata repo.
+and the delphi-epidata repo.
 * in covidcast-indicators: _delphi_utils_python/delphi_utils
 * in delphi-epidata: src/acquisition/covidcast
 """

jhu/delphi_jhu/run.py

Lines changed: 86 additions & 12 deletions
@@ -9,13 +9,15 @@
 import time
 from typing import Dict, Any

+import pandas as pd
 import numpy as np
 from delphi_utils import (
     create_export_csv,
     S3ArchiveDiffer,
     Smoother,
     GeoMapper,
     get_structured_logger,
+    Nans,
 )

 from .geo import geo_map
@@ -63,6 +65,64 @@
 ]


+def add_nancodes(df, metric, geo_res, smoother):
+    """Add nancodes to the dataframe."""
+    idx = pd.IndexSlice
+
+    # Default missingness codes
+    df["missing_val"] = Nans.NOT_MISSING
+    df["missing_se"] = Nans.NOT_APPLICABLE
+    df["missing_sample_size"] = Nans.NOT_APPLICABLE
+
+    # Mark early smoothing entries as data insufficient
+    if smoother == "seven_day_average":
+        df.sort_index(inplace=True)
+        min_time_value = df.index.min()[0] + 5 * pd.Timedelta(days=1)
+        df.loc[idx[:min_time_value, :], "missing_val"] = Nans.PRIVACY
+
+    # Mark Puerto Rico county deaths with a region exception code
+    # Search "Puerto Rico" here for details:
+    # https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data
+    if metric == "deaths" and geo_res == "county":
+        puerto_rico_fips = ["72" + str(i).zfill(3) for i in range(1, 155)]
+        df.loc[idx[:, puerto_rico_fips], "missing_val"] = Nans.REGION_EXCEPTION
+
+    # Mark any remaining nans with unknown
+    remaining_nans_mask = df["val"].isnull() & df["missing_val"].eq(Nans.NOT_MISSING)
+    df.loc[remaining_nans_mask, "missing_val"] = Nans.UNKNOWN
+    return df
+
+def add_missing_current_day(df):
+    """Add missing entry for today if geo had a value previously."""
+    df = df.copy().reset_index()
+    today = pd.Timestamp.today().date()
+    missing_today_mask = lambda x: not any(pd.to_datetime(x).dt.date == today)
+    has_prior_data_mask = lambda x: any(pd.to_datetime(x).dt.date < today)
+    expected_geos = set(
+        df.groupby(["geo_id"])["timestamp"]
+        .agg(
+            mask1=missing_today_mask,
+            mask2=has_prior_data_mask
+        )
+        .query("mask1 == True and mask2 == True")
+        .index
+        .values
+    )
+    new_rows = pd.DataFrame({
+        "timestamp": [pd.to_datetime(today)] * len(expected_geos),
+        "geo_id": list(expected_geos),
+        "new_counts": [np.nan] * len(expected_geos),
+        "cumulative_counts": [np.nan] * len(expected_geos),
+        "population": [np.nan] * len(expected_geos),
+        "incidence": [np.nan] * len(expected_geos),
+        "cumulative_prop": [np.nan] * len(expected_geos),
+        "val": [np.nan] * len(expected_geos),
+        "se": [np.nan] * len(expected_geos),
+        "sample_size": [np.nan] * len(expected_geos),
+    })
+    df = df.append(new_rows).set_index(["timestamp", "geo_id"])
+    return df
+
 def run_module(params: Dict[str, Any]):
     """Run the JHU indicator module.

@@ -86,8 +146,10 @@ def run_module(params: Dict[str, Any]):
     export_dir = params["common"]["export_dir"]
     base_url = params["indicator"]["base_url"]
     logger = get_structured_logger(
-        __name__, filename=params["common"].get("log_filename"),
-        log_exceptions=params["common"].get("log_exceptions", True))
+        __name__,
+        filename=params["common"].get("log_filename"),
+        log_exceptions=params["common"].get("log_exceptions", True),
+    )

     if "archive" in params:
         arch_diff = S3ArchiveDiffer(
@@ -112,16 +174,23 @@ def run_module(params: Dict[str, Any]):
             metric=metric,
             geo_res=geo_res,
             sensor=sensor,
-            smoother=smoother)
+            smoother=smoother,
+        )
         df = dfs[metric]
         # Aggregate to appropriate geographic resolution
         df = geo_map(df, geo_res, sensor)
         df.set_index(["timestamp", "geo_id"], inplace=True)
+
+        # Smooth
         df["val"] = df[sensor].groupby(level=1).transform(SMOOTHERS_MAP[smoother][0])
+
+        # JHU is not a survey data source
         df["se"] = np.nan
         df["sample_size"] = np.nan
-        # Drop early entries where data insufficient for smoothing
-        df = df[~df["val"].isnull()]
+
+        df = add_missing_current_day(df)
+        df = add_nancodes(df, metric, geo_res, smoother)
+
         df = df.reset_index()
         sensor_name = SENSOR_NAME_MAP[sensor][0]
         # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
@@ -141,7 +210,8 @@ def run_module(params: Dict[str, Any]):
         if not oldest_final_export_date:
             oldest_final_export_date = max(exported_csv_dates)
         oldest_final_export_date = min(
-            oldest_final_export_date, max(exported_csv_dates))
+            oldest_final_export_date, max(exported_csv_dates)
+        )

     if arch_diff is not None:
         # Diff exports, and make incremental versions
@@ -167,9 +237,13 @@ def run_module(params: Dict[str, Any]):
     formatted_oldest_final_export_date = None
     if oldest_final_export_date:
         max_lag_in_days = (datetime.now() - oldest_final_export_date).days
-        formatted_oldest_final_export_date = oldest_final_export_date.strftime("%Y-%m-%d")
-    logger.info("Completed indicator run",
-        elapsed_time_in_seconds = elapsed_time_in_seconds,
-        csv_export_count = csv_export_count,
-        max_lag_in_days = max_lag_in_days,
-        oldest_final_export_date = formatted_oldest_final_export_date)
+        formatted_oldest_final_export_date = oldest_final_export_date.strftime(
+            "%Y-%m-%d"
+        )
+    logger.info(
+        "Completed indicator run",
+        elapsed_time_in_seconds=elapsed_time_in_seconds,
+        csv_export_count=csv_export_count,
+        max_lag_in_days=max_lag_in_days,
+        oldest_final_export_date=formatted_oldest_final_export_date,
+    )
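
Taken together, the two new helpers keep nan rows and explain them instead of dropping them. A minimal usage sketch, assuming delphi_jhu and delphi_utils are importable and a pandas version of this era (DataFrame.append, which add_missing_current_day relies on, was removed in pandas 2.0); the toy frame and its values are illustrative only:

import numpy as np
import pandas as pd
from delphi_jhu.run import add_nancodes, add_missing_current_day

# Three counties, one day each; "72001" is a Puerto Rico FIPS code whose
# death count is nan in the raw data.
df = pd.DataFrame({
    "timestamp": pd.to_datetime(["2020-03-21", "2020-03-22", "2020-03-23"]),
    "geo_id": ["01017", "72001", "31000"],
    "val": [0.1, np.nan, np.nan],
    "se": [np.nan] * 3,
    "sample_size": [np.nan] * 3,
}).set_index(["timestamp", "geo_id"])

df = add_missing_current_day(df)  # appends a nan row dated today for each geo with history
df = add_nancodes(df, "deaths", "county", None)

# "72001" rows -> Nans.REGION_EXCEPTION, other nans -> Nans.UNKNOWN,
# observed values -> Nans.NOT_MISSING.
print(df[["val", "missing_val"]])

The ordering in run_module matters: add_missing_current_day runs first so that the freshly appended nan rows for today are then classified by add_nancodes rather than left unexplained.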

jhu/tests/test_run.py

Lines changed: 112 additions & 18 deletions
@@ -2,12 +2,19 @@
 from os.path import join, basename

 import pandas as pd
+import numpy as np
+from delphi_jhu.run import add_nancodes, add_missing_current_day
+from delphi_utils import Nans

+def _non_ignored_files_set(directory):
+    """List all files in a directory not preceded by a '.' and store them in a set."""
+    out = {fname for fname in listdir(directory) if not basename(fname).startswith(".")}
+    return out


 class TestRun:
     def test_output_files_exist(self, run_as_module):

-        csv_files = [x for x in listdir("receiving") if not basename(x).startswith(".")]
+        csv_files = _non_ignored_files_set("receiving")

         dates = [
             "20200303",
@@ -18,28 +25,115 @@ def test_output_files_exist(self, run_as_module):
             "20200308",
             "20200309",
             "20200310",
+            pd.Timestamp.today().date().strftime("%Y%m%d")
         ]
         geos = ["county", "hrr", "msa", "state", "hhs", "nation"]
-        metrics = []
-        for event in ["confirmed", "deaths"]:
-            for smoothing in ["", "_7dav"]:
-                for window in ["incidence", "cumulative"]:
-                    for stat in ["num", "prop"]:
-                        metrics.append(f"{event}{smoothing}_{window}_{stat}")
-
-        expected_files = []
-        for date in dates:
-            for geo in geos:
-                for metric in metrics:
-                    # Can't compute 7dav for first few days of data because of NAs
-                    if date > "20200305" or "7dav" not in metric:
-                        expected_files += [date + "_" + geo + "_" + metric + ".csv"]
-
-        assert set(csv_files) == set(expected_files)
+        signals = ["confirmed", "deaths"]
+        metrics = [
+            "cumulative_num",
+            "cumulative_prop",
+            "incidence_num",
+            "incidence_prop",
+            "7dav_incidence_num",
+            "7dav_incidence_prop",
+            "7dav_cumulative_num",
+            "7dav_cumulative_prop",
+        ]
+
+        expected_files = {
+            date + "_" + geo + "_" + signal + "_" + metric + ".csv"
+            for date in dates
+            for geo in geos
+            for signal in signals
+            for metric in metrics
+        }
+
+        assert csv_files == expected_files

     def test_output_file_format(self, run_as_module):

         df = pd.read_csv(
             join("receiving", "20200310_state_confirmed_cumulative_num.csv")
         )
-        assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
+        assert (
+            df.columns.values
+            == [
+                "geo_id",
+                "val",
+                "se",
+                "sample_size",
+                "missing_val",
+                "missing_se",
+                "missing_sample_size",
+            ]
+        ).all()

+    def test_add_nancodes(self):
+        df = pd.DataFrame({
+            "timestamp": pd.date_range("20200321", "20200328"),
+            "geo_id": ["01017", "01043", "01061", "01103", "02282", "72001", "31000", "49000"],
+            "val": [0.1, 0.2, 0.3, 0.4, 0.5, np.nan, 0.7, np.nan],
+            "se": [np.nan] * 8,
+            "sample_size": [np.nan] * 8
+        }).set_index(["timestamp", "geo_id"])
+        expected_df = pd.DataFrame({
+            "timestamp": pd.date_range("20200321", "20200328"),
+            "geo_id": ["01017", "01043", "01061", "01103", "02282", "72001", "31000", "49000"],
+            "val": [0.1, 0.2, 0.3, 0.4, 0.5, np.nan, 0.7, np.nan],
+            "se": [np.nan] * 8,
+            "sample_size": [np.nan] * 8,
+            "missing_val": [Nans.NOT_MISSING] * 5 + [Nans.REGION_EXCEPTION, Nans.NOT_MISSING, Nans.UNKNOWN],
+            "missing_se": [Nans.NOT_APPLICABLE] * 8,
+            "missing_sample_size": [Nans.NOT_APPLICABLE] * 8,
+        }).set_index(["timestamp", "geo_id"])
+
+        pd.testing.assert_frame_equal(add_nancodes(df, "deaths", "county", None), expected_df)
+
+        df2 = pd.DataFrame({
+            "timestamp": pd.date_range("20200321", "20200328"),
+            "geo_id": ["01017", "01043", "01061", "01103", "02282", "72001", "31000", "49000"],
+            "val": [np.nan] * 6 + [0.7, np.nan],
+            "se": [np.nan] * 8,
+            "sample_size": [np.nan] * 8
+        }).set_index(["timestamp", "geo_id"])
+        expected_df2 = pd.DataFrame({
+            "timestamp": pd.date_range("20200321", "20200328"),
+            "geo_id": ["01017", "01043", "01061", "01103", "02282", "72001", "31000", "49000"],
+            "val": [np.nan] * 6 + [0.7, np.nan],
+            "se": [np.nan] * 8,
+            "sample_size": [np.nan] * 8,
+            "missing_val": [Nans.PRIVACY] * 5 + [Nans.REGION_EXCEPTION, Nans.NOT_MISSING, Nans.UNKNOWN],
+            "missing_se": [Nans.NOT_APPLICABLE] * 8,
+            "missing_sample_size": [Nans.NOT_APPLICABLE] * 8,
+        }).set_index(["timestamp", "geo_id"])
+
+        pd.testing.assert_frame_equal(add_nancodes(df2, "deaths", "county", "seven_day_average"), expected_df2)

+    def test_add_missing_current_day(self):
+        today = pd.Timestamp.today().date().strftime("%Y%m%d")
+        df = pd.DataFrame({
+            "timestamp": pd.to_datetime(["20200304", today]),
+            "geo_id": ["01017", "01061"],
+            "new_counts": [np.nan] * 2,
+            "cumulative_counts": [np.nan] * 2,
+            "population": [np.nan] * 2,
+            "incidence": [np.nan] * 2,
+            "cumulative_prop": [np.nan] * 2,
+            "val": [0.1, 0.3],
+            "se": [np.nan] * 2,
+            "sample_size": [np.nan] * 2
+        }).set_index(["timestamp", "geo_id"])
+        expected_df = pd.DataFrame({
+            "timestamp": pd.to_datetime(["20200304", today, today]),
+            "geo_id": ["01017", "01061", "01017"],
+            "new_counts": [np.nan] * 3,
+            "cumulative_counts": [np.nan] * 3,
+            "population": [np.nan] * 3,
+            "incidence": [np.nan] * 3,
+            "cumulative_prop": [np.nan] * 3,
+            "val": [0.1, 0.3, np.nan],
+            "se": [np.nan] * 3,
+            "sample_size": [np.nan] * 3
+        }).set_index(["timestamp", "geo_id"])
+
+        pd.testing.assert_frame_equal(add_missing_current_day(df), expected_df)
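
The two new unit tests exercise the helpers directly, while the output-file tests still go through the run_as_module fixture. Assuming the repo's usual pytest layout (not shown in this diff), the new tests could be run in isolation with something like:

cd jhu
python -m pytest tests/test_run.py -k "add_nancodes or add_missing_current_day"

Note that both test_output_files_exist and test_add_missing_current_day depend on the real current date via pd.Timestamp.today(), so a run that straddles midnight could fail spuriously.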
