Commit 071aa2b

Merge pull request #551 from cmu-delphi/main
Deploy `chng` to production

2 parents: a5d22d5 + d049649

15 files changed: +196 −70 lines

Jenkinsfile

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@
    - Keep in sync with '.github/workflows/python-ci.yml'.
    - TODO: #527 Get this list automatically from python-ci.yml at runtime.
 */
-def indicator_list = ["cdc_covidnet", "claims_hosp", "combo_cases_and_deaths", "google_symptoms", "jhu", "nchs_mortality", "quidel", "quidel_covidtest", "safegraph", "safegraph_patterns", "usafacts"]
+def indicator_list = ["cdc_covidnet", "changehc", "claims_hosp", "combo_cases_and_deaths", "google_symptoms", "jhu", "nchs_mortality", "quidel", "quidel_covidtest", "safegraph", "safegraph_patterns", "usafacts"]
 def build_package = [:]
 def deploy_staging = [:]
 def deploy_production = [:]
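
The indicator_list above is duplicated from the CI matrix by hand, which is why the comment asks to keep it in sync with '.github/workflows/python-ci.yml'. A minimal sketch of what TODO #527 might look like, assuming the workflow stores the package names in a job matrix — the key names below are hypothetical, not taken from the actual workflow:

    # Hypothetical resolution of TODO #527: derive the indicator list from the
    # CI workflow instead of hard-coding it in the Jenkinsfile.
    # Assumes a matrix key "packages" under jobs.build.strategy; the real
    # python-ci.yml may use different names.
    import yaml  # PyYAML

    def indicators_from_workflow(path=".github/workflows/python-ci.yml"):
        with open(path) as f:
            workflow = yaml.safe_load(f)
        return sorted(workflow["jobs"]["build"]["strategy"]["matrix"]["packages"])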
Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+{
+    "static_file_dir": "./static",
+    "raw_data_dir": "/common/safegraph",
+    "export_dir": "./receiving",
+    "cache_dir": "./cache",
+    "n_core": "12",
+    "aws_access_key_id": "{{ safegraph_aws_access_key_id }}",
+    "aws_secret_access_key": "{{ safegraph_aws_secret_access_key }}",
+    "aws_default_region": "us-east-1",
+    "aws_endpoint": "https://s3.wasabisys.com",
+    "sync": true,
+    "wip_signal" : []
+}
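
This new file (its name was not captured in this view) is a Jinja2-templated params file for the safegraph indicator; the `{{ safegraph_* }}` placeholders are filled in at deploy time. A minimal sketch of how the rendered file is consumed, assuming it is installed as the indicator's params.json, the conventional location read by delphi_utils:

    # Sketch: the indicator reads the rendered template at runtime.
    from delphi_utils import read_params

    params = read_params()  # loads params.json from the working directory
    n_core = int(params["n_core"])  # numeric values are stored as strings here
    endpoint = params["aws_endpoint"]  # e.g. the S3-compatible Wasabi endpoint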

google_symptoms/DETAILS.md

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@ hrr level data is derived from county level data using population weighted averages
 ## Metrics, Level 1 (`m1`)
 * `anosmia`: Google search volume for Anosmia-related searches
 * `ageusia`: Google search volume for Ageusia-related searches
-* `combined_symptoms`: The sum of Google search volume for Anosmia-related searches and Ageusia-related searches.
+* `sum_anosmia_ageusia`: The sum of Google search volume for Anosmia-related searches and Ageusia-related searches.

 ## Metrics, Level 2 (`m2`)
 * `raw_search`: Google search volume reported as-is

google_symptoms/delphi_google_symptoms/constants.py

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@

 # global constants
 METRICS = ["Anosmia", "Ageusia"]
+COMBINED_METRIC = "sum_anosmia_ageusia"
 SMOOTHERS = ["raw", "smoothed"]
 GEO_RESOLUTIONS = [
     "state",

google_symptoms/delphi_google_symptoms/geo.py

Lines changed: 2 additions & 2 deletions

@@ -2,7 +2,7 @@
 import numpy as np
 import pandas as pd
 from delphi_utils import GeoMapper
-from .constants import METRICS
+from .constants import METRICS, COMBINED_METRIC

 gmpr = GeoMapper()
 def generate_transition_matrix(geo_res):
@@ -64,7 +64,7 @@ def geo_map(df, geo_res):
     for _date in df["timestamp"].unique():
        val_lists = df[df["timestamp"] == _date].merge(
            map_df["geo_id"], how="right"
-       )[METRICS + ["combined_symptoms"]].fillna(0)
+       )[METRICS + [COMBINED_METRIC]].fillna(0)
        newdf = pd.DataFrame(
            np.matmul(map_df.values[:, 1:].T, val_lists.values),
            columns = list(val_lists.keys())
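
For context, geo_map aggregates county values to coarser geographies by multiplying a transition matrix against the per-date value columns. A toy illustration of that step with made-up weights — the real matrix comes from generate_transition_matrix via GeoMapper:

    import numpy as np
    import pandas as pd

    # Rows of `weights` are hypothetical target regions; columns are counties.
    weights = np.array([[1.0, 0.4],
                        [0.0, 0.6]])
    vals = pd.DataFrame({"Anosmia": [10.0, 5.0], "Ageusia": [2.0, 1.0]},
                        index=["01001", "01003"])
    aggregated = pd.DataFrame(np.matmul(weights, vals.values),
                              columns=vals.columns)
    print(aggregated)  # region totals as weighted sums of county values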

google_symptoms/delphi_google_symptoms/pull.py

Lines changed: 5 additions & 5 deletions

@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd

-from .constants import STATE_TO_ABBREV, DC_FIPS, METRICS
+from .constants import STATE_TO_ABBREV, DC_FIPS, METRICS, COMBINED_METRIC

 def get_geo_id(region_code):
     """
@@ -42,16 +42,16 @@ def preprocess(df, level):
         Dataframe as described above.
     """
     # Constants
-    KEEP_COLUMNS = ["geo_id", "date"] + METRICS + ["combined_symptoms"]
+    KEEP_COLUMNS = ["geo_id", "date"] + METRICS + [COMBINED_METRIC]

-    df["combined_symptoms"] = 0
+    df[COMBINED_METRIC] = 0
     for metric in METRICS:
         df.rename({"symptom:" + metric: metric}, axis = 1, inplace = True)
-        df["combined_symptoms"] += df[metric].fillna(0)
+        df[COMBINED_METRIC] += df[metric].fillna(0)
     df.loc[
         (df["Anosmia"].isnull())
         & (df["Ageusia"].isnull())
-        , "combined_symptoms"] = np.nan
+        , COMBINED_METRIC] = np.nan

     # Delete rows with missing FIPS
     null_mask = (df["geo_id"].isnull())
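
The combined metric's NaN handling is worth spelling out: each missing input is treated as 0 in the sum, but the sum is reset to NaN when both inputs are missing, so an all-missing row is not reported as a true zero. A self-contained sketch of exactly that logic:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"Anosmia": [1.0, np.nan, np.nan],
                       "Ageusia": [2.0, 4.0, np.nan]})
    df["sum_anosmia_ageusia"] = df["Anosmia"].fillna(0) + df["Ageusia"].fillna(0)
    # NaN only when both inputs are missing.
    df.loc[df["Anosmia"].isnull() & df["Ageusia"].isnull(),
           "sum_anosmia_ageusia"] = np.nan
    print(df["sum_anosmia_ageusia"].tolist())  # [3.0, 4.0, nan]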

google_symptoms/delphi_google_symptoms/run.py

Lines changed: 3 additions & 2 deletions

@@ -12,7 +12,8 @@

 from .pull import pull_gs_data
 from .geo import geo_map
-from .constants import METRICS, GEO_RESOLUTIONS, SMOOTHERS, SMOOTHERS_MAP
+from .constants import (METRICS, COMBINED_METRIC,
+                        GEO_RESOLUTIONS, SMOOTHERS, SMOOTHERS_MAP)


 def run_module():
@@ -31,7 +32,7 @@ def run_module():
         df_pull = dfs["county"]
         df_pull = geo_map(df_pull, geo_res)
         for metric, smoother in product(
-                METRICS+["combined_symptoms"], SMOOTHERS):
+                METRICS+[COMBINED_METRIC], SMOOTHERS):
             print(geo_res, metric, smoother)
             df = df_pull.set_index(["timestamp", "geo_id"])
             df["val"] = df[metric].groupby(level=1

google_symptoms/tests/test_geo.py

Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
+import pytest
+
+from os.path import join
+
+import numpy as np
+import pandas as pd
+
+from delphi_google_symptoms.geo import geo_map
+from delphi_google_symptoms.constants import METRICS, COMBINED_METRIC
+
+class TestGeo:
+    def test_fips(self):
+        df = pd.DataFrame(
+            {
+                "geo_id": ["53003", "48027", "50103"],
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15"],
+                METRICS[0]: [10, 15, 2],
+                METRICS[1]: [100, 20, 45],
+                COMBINED_METRIC: [110, 35, 47],
+            }
+        )
+        new_df = geo_map(df, "county")
+
+        assert set(new_df.keys()) == set(df.keys())
+        assert (new_df[METRICS[0]] == df[METRICS[0]]).all()
+        assert (new_df[METRICS[1]] == df[METRICS[1]]).all()
+        assert (new_df[COMBINED_METRIC] == df[COMBINED_METRIC]).all()
+
+    def test_hrr(self):
+        df = pd.DataFrame(
+            {
+                "geo_id": ["01001", "01009", "01007"],
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15"],
+                METRICS[0]: [10, 15, 2],
+                METRICS[1]: [100, 20, 45],
+                COMBINED_METRIC: [110, 35, 47],
+            }
+        )
+        new_df = geo_map(df, "hrr").dropna()
+
+        assert set(new_df.keys()) == set(df.keys())
+        assert set(new_df["geo_id"]) == set(["1", "5", "7", "9"])
+        assert new_df[METRICS[0]].values == pytest.approx([0.39030655604059333,
+                                                           0.014572815050225169,
+                                                           1.1509470322941868,
+                                                           0.08525105356979307])
+        assert new_df[METRICS[1]].values == pytest.approx([0.7973533171562179,
+                                                           0.019430420066966894,
+                                                           11.509470322941867,
+                                                           1.918148705320344])
+        assert new_df[COMBINED_METRIC].values == pytest.approx(
+                new_df[METRICS[0]].values + new_df[METRICS[1]])
+
+    def test_msa(self):
+        df = pd.DataFrame(
+            {
+                "geo_id": ["01001", "01009", "01007"],
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15"],
+                METRICS[0]: [10, 15, 2],
+                METRICS[1]: [100, 20, 45],
+                COMBINED_METRIC: [110, 35, 47],
+            }
+        )
+        new_df = geo_map(df, "msa").dropna()
+
+        assert set(new_df.keys()) == set(df.keys())
+        assert set(new_df["geo_id"]) == set(["13820", "33860"])
+        assert new_df[METRICS[0]].values == pytest.approx([0.8365267072315176,
+                                                           1.4966647914490074])
+        assert new_df[METRICS[1]].values == pytest.approx([1.9847583762443426,
+                                                           14.966647914490075])
+        assert new_df[COMBINED_METRIC].values == pytest.approx(
+                new_df[METRICS[0]].values + new_df[METRICS[1]])

google_symptoms/tests/test_pull.py

Lines changed: 11 additions & 10 deletions

@@ -3,6 +3,7 @@
 import pandas as pd

 from delphi_google_symptoms.pull import pull_gs_data, preprocess
+from delphi_google_symptoms.constants import METRICS, COMBINED_METRIC

 base_url_good = "./test_data{sub_url}small_{state}symptoms_dataset.csv"

@@ -20,23 +21,23 @@ def test_good_file(self):
             df = dfs[level]
             assert (
                 df.columns.values
-                == ["geo_id", "timestamp", "Anosmia", "Ageusia", "combined_symptoms"]
+                == ["geo_id", "timestamp"] + METRICS + [COMBINED_METRIC]
             ).all()

             # combined_symptoms is nan when both Anosmia and Ageusia are nan
             assert sum(~df.loc[
-                (df["Anosmia"].isnull())
-                & (df["Ageusia"].isnull())
-                , "combined_symptoms"].isnull()) == 0
+                (df[METRICS[0]].isnull())
+                & (df[METRICS[1]].isnull())
+                , COMBINED_METRIC].isnull()) == 0
             # combined_symptoms is not nan when either Anosmia or Ageusia isn't nan
             assert sum(df.loc[
-                (~df["Anosmia"].isnull())
-                & (df["Ageusia"].isnull())
-                , "combined_symptoms"].isnull()) == 0
+                (~df[METRICS[0]].isnull())
+                & (df[METRICS[1]].isnull())
+                , COMBINED_METRIC].isnull()) == 0
             assert sum(df.loc[
-                (df["Anosmia"].isnull())
-                & (~df["Ageusia"].isnull())
-                , "combined_symptoms"].isnull()) == 0
+                (df[METRICS[0]].isnull())
+                & (~df[METRICS[1]].isnull())
+                , COMBINED_METRIC].isnull()) == 0

     def test_missing_cols(self):
         df = pd.read_csv(base_url_bad["missing_cols"])

google_symptoms/tests/test_run.py

Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@ def test_output_files_exist(self, run_as_module):
            "20200811"
        ]
        geos = ["county", "state"]
-       metrics = ["anosmia", "ageusia", "combined_symptoms"]
+       metrics = ["anosmia", "ageusia", "sum_anosmia_ageusia"]
        smoother = ["raw", "smoothed"]

        expected_files = []

nchs_mortality/delphi_nchs_mortality/pull.py

Lines changed: 17 additions & 27 deletions

@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 from sodapy import Socrata
+from .constants import METRICS

 def pull_nchs_mortality_data(token: str, map_df: pd.DataFrame, test_mode: str):
     """Pull the latest NCHS Mortality data and conform it into a dataset.
@@ -33,10 +34,7 @@ def pull_nchs_mortality_data(token: str, map_df: pd.DataFrame, test_mode: str):
         Dataframe as described above.
     """
     # Constants
-    keep_columns = ['covid_deaths', 'total_deaths',
-                    'percent_of_expected_deaths', 'pneumonia_deaths',
-                    'pneumonia_and_covid_deaths', 'influenza_deaths',
-                    'pneumonia_influenza_or_covid_19_deaths']
+    keep_columns = METRICS.copy()
     type_dict = {key: float for key in keep_columns}
     type_dict["timestamp"] = 'datetime64[ns]'

@@ -64,31 +62,23 @@
                 "schema may have changed. Please investigate and "
                 "amend the code.") from exc

+    # Drop rows for locations outside US
     df = df[df["state"] != "United States"]
-    df.loc[df["state"] == "New York City", "state"] = "New York"
+    df = df.loc[:, keep_columns + ["timestamp", "state"]].set_index("timestamp")

-    state_list = df["state"].unique()
-    date_list = df["timestamp"].unique()
-    index_df = pd.MultiIndex.from_product(
-        [state_list, date_list], names=['state', 'timestamp']
-    )
-    df = df.groupby(
-        ["state", "timestamp"]).sum().reindex(index_df).reset_index()
-
-    # Final sanity checks
-    days_by_states = df.groupby("state").count()["covid_deaths"].unique()
-    unique_days = df["timestamp"].unique()
-    # each FIPS has same number of rows
-    if (len(days_by_states) > 1) or (days_by_states[0] != len(unique_days)):
-        raise ValueError("Differing number of days by fips")
-    min_timestamp = min(unique_days)
-    max_timestamp = max(unique_days)
-    n_days = (max_timestamp - min_timestamp) / np.timedelta64(1, 'D') / 7 + 1
-    if n_days != len(unique_days):
-        raise ValueError(
-            f"Not every day between {min_timestamp} and "
-            "{max_timestamp} is represented."
-        )
+    # NCHS considers NYC as an individual state; however, we want it included
+    # in NY. If values are NaN for both NYC and NY, the aggregation should
+    # also have NaN.
+    df_ny = df.loc[df["state"] == "New York", :].drop("state", axis=1)
+    df_nyc = df.loc[df["state"] == "New York City", :].drop("state", axis=1)
+    # Get mask df to ignore cells where both of them have NaN values
+    mask = (df_ny[keep_columns].isnull().values \
+            & df_nyc[keep_columns].isnull().values)
+    df_ny = df_ny.append(df_nyc).groupby("timestamp").sum().where(~mask, np.nan)
+    df_ny["state"] = "New York"
+    # Drop NYC and NY in the full dataset
+    df = df.loc[~df["state"].isin(["New York", "New York City"]), :]
+    df = df.append(df_ny).reset_index().sort_values(["state", "timestamp"])

     # Add population info
     keep_columns.extend(["timestamp", "geo_id", "population"])
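
The NY/NYC merge sums the two series while preserving NaN where both are missing, via a boolean mask and DataFrame.where. A toy sketch of the pattern with made-up numbers; pd.concat is used here in place of the diff's DataFrame.append, which newer pandas deprecates:

    import numpy as np
    import pandas as pd

    idx = pd.to_datetime(["2020-05-02", "2020-05-09"])
    df_ny = pd.DataFrame({"covid_deaths": [np.nan, 10.0]}, index=idx)
    df_nyc = pd.DataFrame({"covid_deaths": [np.nan, 3.0]}, index=idx)
    # Mask cells where both frames are missing, so the sum stays NaN there.
    mask = df_ny.isnull().values & df_nyc.isnull().values
    merged = (pd.concat([df_ny, df_nyc])
                .groupby(level=0).sum()    # NaN-skipping sum: NaN+NaN -> 0.0
                .where(~mask, np.nan))     # restore NaN where both were missing
    print(merged)  # 2020-05-02 -> NaN, 2020-05-09 -> 13.0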

nchs_mortality/delphi_nchs_mortality/run.py

Lines changed: 12 additions & 8 deletions

@@ -17,7 +17,7 @@
 from .constants import (METRICS, SENSOR_NAME_MAP,
                         SENSORS, INCIDENCE_BASE, GEO_RES)

-def run_module():  # pylint: disable=too-many-branches,too-many-statements
+def run_module():
     """Run module for processing NCHS mortality data."""
     params = read_params()
     export_start_date = params["export_start_date"]
@@ -41,13 +41,15 @@ def run_module():
         join(static_file_dir, "state_pop.csv"), dtype={"fips": int}
     )

-    df = pull_nchs_mortality_data(token, map_df, test_mode)
+    df_pull = pull_nchs_mortality_data(token, map_df, test_mode)
     for metric in METRICS:
         if metric == 'percent_of_expected_deaths':
             print(metric)
+            df = df_pull.copy()
             df["val"] = df[metric]
             df["se"] = np.nan
             df["sample_size"] = np.nan
+            df = df[~df["val"].isnull()]
             sensor_name = "_".join(["wip", SENSOR_NAME_MAP[metric]])
             export_csv(
                 df,
@@ -59,12 +61,14 @@ def run_module():
         else:
             for sensor in SENSORS:
                 print(metric, sensor)
+                df = df_pull.copy()
                 if sensor == "num":
                     df["val"] = df[metric]
                 else:
                     df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
                 df["se"] = np.nan
                 df["sample_size"] = np.nan
+                df = df[~df["val"].isnull()]
                 sensor_name = "_".join(["wip", SENSOR_NAME_MAP[metric], sensor])
                 export_csv(
                     df,
@@ -74,10 +78,10 @@ def run_module():
                     sensor=sensor_name,
                 )

-   # Weekly run of archive utility on Monday
-   # - Does not upload to S3, that is handled by daily run of archive utility
-   # - Exports issues into receiving for the API
-   # Daily run of archiving utility
-   # - Uploads changed files to S3
-   # - Does not export any issues into receiving
+    # Weekly run of archive utility on Monday
+    # - Does not upload to S3, that is handled by daily run of archive utility
+    # - Exports issues into receiving for the API
+    # Daily run of archiving utility
+    # - Uploads changed files to S3
+    # - Does not export any issues into receiving
     arch_diffs(params, daily_arch_diff)
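
The switch from a single df to df_pull.copy() matters because each (metric, sensor) pass overwrites the "val" column and now drops null rows; reusing one frame would let earlier passes leak their filtering into later ones. A minimal sketch of the hazard the copy avoids:

    import pandas as pd

    df_pull = pd.DataFrame({"a": [1.0, None], "b": [None, 2.0]})
    for metric in ["a", "b"]:
        df = df_pull.copy()           # fresh frame for each metric
        df["val"] = df[metric]
        df = df[~df["val"].isnull()]  # row-dropping stays local to this pass
        print(metric, len(df))        # a 1, b 1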
