Skip to content

Fix nchs missing value #535

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 18, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 15 additions & 23 deletions nchs_mortality/delphi_nchs_mortality/pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,31 +64,23 @@ def pull_nchs_mortality_data(token: str, map_df: pd.DataFrame, test_mode: str):
"schema may have changed. Please investigate and "
"amend the code.") from exc

# Drop rows for locations outside US
df = df[df["state"] != "United States"]
df.loc[df["state"] == "New York City", "state"] = "New York"
df = df.loc[:, keep_columns + ["timestamp", "state"]].set_index("timestamp")

state_list = df["state"].unique()
date_list = df["timestamp"].unique()
index_df = pd.MultiIndex.from_product(
[state_list, date_list], names=['state', 'timestamp']
)
df = df.groupby(
["state", "timestamp"]).sum().reindex(index_df).reset_index()
Comment on lines -75 to -76
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this reindex step was used to fill in missing values by state and time and removing it makes sense given the discussion.
I am not super familiar with the actual NCHS data, but were there cases of duplicated (state, timestamp) where the sum aggregation should still be retained?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, you are right! Let me fix this. Thanks!


# Final sanity checks
days_by_states = df.groupby("state").count()["covid_deaths"].unique()
unique_days = df["timestamp"].unique()
# each FIPS has same number of rows
if (len(days_by_states) > 1) or (days_by_states[0] != len(unique_days)):
raise ValueError("Differing number of days by fips")
min_timestamp = min(unique_days)
max_timestamp = max(unique_days)
n_days = (max_timestamp - min_timestamp) / np.timedelta64(1, 'D') / 7 + 1
if n_days != len(unique_days):
raise ValueError(
f"Not every day between {min_timestamp} and "
"{max_timestamp} is represented."
)
# NCHS considers NYC as an individual state; however, we want it included
# in NY. If values are NaN for both NYC and NY, the aggregation should
# also have NaN.
df_ny = df.loc[df["state"] == "New York", :].drop("state", axis=1)
df_nyc = df.loc[df["state"] == "New York City", :].drop("state", axis=1)
# Get mask df to ignore cells where both of them have NAN values
mask = (df_ny[keep_columns].isnull().values \
& df_nyc[keep_columns].isnull().values)
df_ny = df_ny.append(df_nyc).groupby("timestamp").sum().where(~mask, np.nan)
df_ny["state"] = "New York"
# Drop NYC and NY in the full dataset
df = df.loc[~df["state"].isin(["New York", "New York City"]), :]
df = df.append(df_ny).reset_index().sort_values(["state", "timestamp"])

# Add population info
keep_columns.extend(["timestamp", "geo_id", "population"])
Expand Down
16 changes: 9 additions & 7 deletions nchs_mortality/delphi_nchs_mortality/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from .constants import (METRICS, SENSOR_NAME_MAP,
SENSORS, INCIDENCE_BASE, GEO_RES)

def run_module(): # pylint: disable=too-many-branches,too-many-statements
def run_module():
"""Run module for processing NCHS mortality data."""
params = read_params()
export_start_date = params["export_start_date"]
Expand Down Expand Up @@ -48,6 +48,7 @@ def run_module(): # pylint: disable=too-many-branches,too-many-statements
df["val"] = df[metric]
df["se"] = np.nan
df["sample_size"] = np.nan
df = df[~df["val"].isnull()]
sensor_name = "_".join(["wip", SENSOR_NAME_MAP[metric]])
export_csv(
df,
Expand All @@ -65,6 +66,7 @@ def run_module(): # pylint: disable=too-many-branches,too-many-statements
df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
df["se"] = np.nan
df["sample_size"] = np.nan
df = df[~df["val"].isnull()]
sensor_name = "_".join(["wip", SENSOR_NAME_MAP[metric], sensor])
export_csv(
df,
Expand All @@ -74,10 +76,10 @@ def run_module(): # pylint: disable=too-many-branches,too-many-statements
sensor=sensor_name,
)

# Weekly run of archive utility on Monday
# - Does not upload to S3, that is handled by daily run of archive utility
# - Exports issues into receiving for the API
# Daily run of archiving utility
# - Uploads changed files to S3
# - Does not export any issues into receiving
# Weekly run of archive utility on Monday
# - Does not upload to S3, that is handled by daily run of archive utility
# - Exports issues into receiving for the API
# Daily run of archiving utility
# - Uploads changed files to S3
# - Does not export any issues into receiving
arch_diffs(params, daily_arch_diff)
18 changes: 7 additions & 11 deletions nchs_mortality/tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,13 @@ def test_output_files_exist(self, run_as_module, date):
csv_files = listdir(output_folder)

dates = [
"202016",
"202017",
"202018",
"202019",
"202020",
"202021",
"202022",
"202023",
"202024",
"202025",
"202026",
"202030",
"202031",
"202032",
"202033",
"202034",
"202035",
"202036",
]
metrics = ['deaths_covid_incidence',
'deaths_allcause_incidence',
Expand Down