Merge pull request #456 from cmu-delphi/nchs_mortality

krivard · web-flow · commit 16cd3685dd2d · 2020-11-09T13:23:12.000-05:00
Update nchs_mortality pipeline with new signal names
diff --git a/nchs_mortality/DETAILS.md b/nchs_mortality/DETAILS.md
@@ -1,6 +1,6 @@
 # NCHS Mortality Data
 
-We import the NCHS Mortality Data from CDC website and export
+We import the Mortality Data from the NCHS website and export
 the state-level data as-is in a weekly format.  
 
 In order to avoid confusing public consumers of the data, we maintain
@@ -57,6 +57,6 @@ as the corresponding epiweek of date(D + 1).
 ### Data Versioning
 Data versions are tracked on both a daily and weekly level.
 On a daily level, we check for updates for NCHS mortality data every weekday as how it is reported by 
-CDC and stash these daily updates on S3, but not our API.
+NCHS and stash these daily updates on S3, but not our API.
 On a weekly level (on Mondays), we additionally upload the changes to the data 
 made over the past week (due to backfill) to our API.
diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py
@@ -22,6 +22,15 @@
         'pneumonia_deaths', 'pneumonia_and_covid_deaths', 'influenza_deaths',
         'pneumonia_influenza_or_covid_19_deaths'
 ]
+SENSOR_NAME_MAP = {
+        "covid_deaths": "deaths_covid_incidence",
+        "total_deaths": "deaths_allcause_incidence",
+        "percent_of_expected_deaths": "deaths_percent_of_expected",
+        "pneumonia_deaths": "deaths_pneumonia_notflu_incidence",
+        "pneumonia_and_covid_deaths": "deaths_covid_and_pneumonia_notflu_incidence",
+        "influenza_deaths": "deaths_flu_incidence",
+        "pneumonia_influenza_or_covid_19_deaths": "deaths_pneumonia_or_flu_or_covid_incidence"
+}
 SENSORS = [
         "num",
         "prop"
@@ -62,7 +71,7 @@ def run_module():  # pylint: disable=too-many-branches,too-many-statements
             df["val"] = df[metric]
             df["se"] = np.nan
             df["sample_size"] = np.nan
-            sensor_name = "_".join(["wip", metric])
+            sensor_name = "_".join(["wip", SENSOR_NAME_MAP[metric]])
             export_csv(
                 df,
                 geo_name=GEO_RES,
@@ -79,7 +88,7 @@ def run_module():  # pylint: disable=too-many-branches,too-many-statements
                     df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
                 df["se"] = np.nan
                 df["sample_size"] = np.nan
-                sensor_name = "_".join(["wip", metric, sensor])
+                sensor_name = "_".join(["wip", SENSOR_NAME_MAP[metric], sensor])
                 export_csv(
                     df,
                     geo_name=GEO_RES,
diff --git a/nchs_mortality/tests/test_run.py b/nchs_mortality/tests/test_run.py
@@ -32,17 +32,19 @@ def test_output_files_exist(self, run_as_module, date):
                 "202025",
                 "202026",
             ]
-            metrics = [
-                    'covid_deaths', 'total_deaths', 'pneumonia_deaths',
-                    'pneumonia_and_covid_deaths', 'influenza_deaths',
-                    'pneumonia_influenza_or_covid_19_deaths'
-            ]
+            metrics = ['deaths_covid_incidence',
+                       'deaths_allcause_incidence',
+                       'deaths_percent_of_expected',
+                       'deaths_pneumonia_notflu_incidence',
+                       'deaths_covid_and_pneumonia_notflu_incidence',
+                       'deaths_flu_incidence',
+                       'deaths_pneumonia_or_flu_or_covid_incidence']
             sensors = ["num", "prop"]
 
             expected_files = []
             for date in dates:
                 for metric in metrics:
-                    if metric == "percent_of_expected_deaths":
+                    if metric == "deaths_percent_of_expected":
                         expected_files += ["weekly_" + date + "_state_wip_" \
                                            + metric + ".csv"]
                     else:
@@ -61,6 +63,6 @@ def test_output_file_format(self, run_as_module, date):
 
         for output_folder in folders:
             df = pd.read_csv(
-                join(output_folder, "weekly_202026_state_wip_covid_deaths_prop.csv")
+                join(output_folder, "weekly_202026_state_wip_deaths_covid_incidence_prop.csv")
             )
             assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()