
Commit 2ddcbee

Merge branch 'main' of https://github.com/cmu-delphi/covidcast-indicators into even_more_logging
2 parents: 5d3988a + 47bfd62

File tree: 14 files changed, +161 −93 lines

_delphi_utils_python/delphi_utils/geomap.py

Lines changed: 11 additions & 0 deletions
@@ -528,6 +528,17 @@ def fips_to_megacounty(
         data = data.reset_index().groupby([date_col, mega_col]).sum()
         return data.reset_index()
 
+    def as_mapper_name(self, geo_type, state="state_id"):
+        """
+        Return the mapper equivalent of a region type.
+
+        Human-readable names like 'county' will return their mapper equivalents ('fips').
+        """
+        if geo_type == "state":
+            return state
+        if geo_type == "county":
+            return "fips"
+        return geo_type
     def get_geo_values(self, geo_type):
         """
         Return a set of all values for a given geography type.
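
For context, downstream code can now pass human-readable geo types and let the mapper translate them; a minimal usage sketch, assuming `delphi_utils` is installed:

```python
from delphi_utils import GeoMapper

gmpr = GeoMapper()

# Human-readable geo types translate to the mapper's internal names;
# anything the helper does not recognize passes through unchanged.
print(gmpr.as_mapper_name("county"))  # "fips"
print(gmpr.as_mapper_name("state"))   # "state_id" (the default for states)
print(gmpr.as_mapper_name("msa"))     # "msa"
```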

changehc/delphi_changehc/config.py

Lines changed: 0 additions & 23 deletions
@@ -54,26 +54,3 @@ class Config:
         7 # maximum number of days used to average a backfill correction
     )
     MIN_CUM_VISITS = 500 # need to observe at least 500 counts before averaging
-
-
-class Constants:
-    """
-    Contains the maximum number of geo units for each geo type.
-
-    Used for sanity checks
-    """
-
-    # number of counties in usa, including megacounties
-    NUM_COUNTIES = 3141 + 52
-    NUM_HRRS = 308
-    NUM_MSAS = 392 + 52 # MSA + States
-    NUM_STATES = 52 # including DC and PR
-    NUM_NATIONS = 1
-    NUM_HHSS = 10
-
-    MAX_GEO = {"county": NUM_COUNTIES,
-               "hrr": NUM_HRRS,
-               "msa": NUM_MSAS,
-               "state": NUM_STATES,
-               "nation": NUM_NATIONS,
-               "hhs": NUM_HHSS}

changehc/delphi_changehc/update_sensor.py

Lines changed: 2 additions & 2 deletions
@@ -14,7 +14,7 @@
 from delphi_utils import GeoMapper, read_params, add_prefix
 
 # first party
-from .config import Config, Constants
+from .config import Config
 from .constants import SMOOTHED, SMOOTHED_ADJ, SMOOTHED_CLI, SMOOTHED_ADJ_CLI, NA
 from .sensor import CHCSensor
 from .weekday import Weekday
@@ -164,7 +164,7 @@ def geo_reindex(self, data):
         # for each location, fill in all missing dates with 0 values
         multiindex = pd.MultiIndex.from_product((unique_geo_ids, self.fit_dates),
                                                 names=[geo, Config.DATE_COL])
-        assert (len(multiindex) <= (Constants.MAX_GEO[geo] * len(self.fit_dates))
+        assert (len(multiindex) <= (len(gmpr.get_geo_values(gmpr.as_mapper_name(geo))) * len(self.fit_dates))
                 ), "more loc-date pairs than maximum number of geographies x number of dates"
         # fill dataframe with missing dates using 0
         data_frame = data_frame.reindex(multiindex, fill_value=0)
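
The sanity bound formerly read from the hardcoded `Constants.MAX_GEO` table is now derived from the geomapper itself; a standalone sketch of the new bound (the fit window below is hypothetical):

```python
from delphi_utils import GeoMapper

gmpr = GeoMapper()
geo = "county"
fit_dates = ["2020-09-01", "2020-09-02"]  # hypothetical fit window

# Maximum number of (location, date) pairs the reindexed frame may hold.
max_pairs = len(gmpr.get_geo_values(gmpr.as_mapper_name(geo))) * len(fit_dates)
print(max_pairs)
```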

changehc/tests/test_load_data.py

Lines changed: 4 additions & 4 deletions
@@ -2,15 +2,14 @@
 import pytest
 
 # third party
-from delphi_utils import read_params
+from delphi_utils import read_params, GeoMapper
 import pandas as pd
 
 # first party
-from delphi_changehc.config import Config, Constants
+from delphi_changehc.config import Config
 from delphi_changehc.load_data import *
 
 CONFIG = Config()
-CONSTANTS = Constants()
 PARAMS = read_params()
 COVID_FILEPATH = PARAMS["input_covid_file"]
 DENOM_FILEPATH = PARAMS["input_denom_file"]
@@ -24,6 +23,7 @@ class TestLoadData:
                                 Config.COVID_COLS, Config.COVID_DTYPES, Config.COVID_COL)
     combined_data = load_combined_data(DENOM_FILEPATH, COVID_FILEPATH, DROP_DATE,
                                        "fips")
+    gmpr = GeoMapper()
 
     def test_base_unit(self):
         with pytest.raises(AssertionError):
@@ -78,7 +78,7 @@ def test_fips_values(self):
                      self.combined_data]:
             assert (
                 len(data.index.get_level_values(
-                    'fips').unique()) <= CONSTANTS.NUM_COUNTIES
+                    'fips').unique()) <= len(self.gmpr.get_geo_values("fips"))
             )
 
     def test_combined_fips_values(self):

changehc/tests/test_update_sensor.py

Lines changed: 1 addition & 2 deletions
@@ -15,11 +15,10 @@
 from delphi_utils import read_params
 
 # first party
-from delphi_changehc.config import Config, Constants
+from delphi_changehc.config import Config
 from delphi_changehc.update_sensor import write_to_csv, CHCSensorUpdator
 
 CONFIG = Config()
-CONSTANTS = Constants()
 PARAMS = read_params()
 COVID_FILEPATH = PARAMS["input_covid_file"]
 DENOM_FILEPATH = PARAMS["input_denom_file"]

nchs_mortality/delphi_nchs_mortality/constants.py

Lines changed: 15 additions & 4 deletions

@@ -1,16 +1,22 @@
 """Registry for constants."""
 # global constants
 METRICS = [
-    "covid_deaths", "total_deaths", "percent_of_expected_deaths",
-    "pneumonia_deaths", "pneumonia_and_covid_deaths", "influenza_deaths",
+    "covid_19_deaths", "total_deaths", "percent_of_expected_deaths",
+    "pneumonia_deaths", "pneumonia_and_covid_19_deaths", "influenza_deaths",
     "pneumonia_influenza_or_covid_19_deaths"
 ]
+RENAME = [
+    ("start_week", "timestamp"),
+    ("start_date", "timestamp"),
+    ("covid_deaths", "covid_19_deaths"),
+    ("pneumonia_and_covid_deaths", "pneumonia_and_covid_19_deaths")
+]
 SENSOR_NAME_MAP = {
-    "covid_deaths": "deaths_covid_incidence",
+    "covid_19_deaths": "deaths_covid_incidence",
     "total_deaths": "deaths_allcause_incidence",
     "percent_of_expected_deaths": "deaths_percent_of_expected",
     "pneumonia_deaths": "deaths_pneumonia_notflu_incidence",
-    "pneumonia_and_covid_deaths": "deaths_covid_and_pneumonia_notflu_incidence",
+    "pneumonia_and_covid_19_deaths": "deaths_covid_and_pneumonia_notflu_incidence",
     "influenza_deaths": "deaths_flu_incidence",
     "pneumonia_influenza_or_covid_19_deaths": "deaths_pneumonia_or_flu_or_covid_incidence"
 }
@@ -20,3 +26,8 @@
 ]
 INCIDENCE_BASE = 100000
 GEO_RES = "state"
+
+# this is necessary as a delimiter in the f-string expressions we use to
+# construct detailed error reports
+# (https://www.python.org/dev/peps/pep-0498/#escape-sequences)
+NEWLINE = "\n"
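
Why a module-level constant for a newline: before Python 3.12, a backslash cannot appear inside the expression part of an f-string (see PEP 498), so the joins used in the error reports go through `NEWLINE` instead. A small illustration, with a hypothetical column list:

```python
NEWLINE = "\n"
columns = ["covid_19_deaths", "total_deaths"]  # hypothetical column list

# A backslash is fine in the literal text of an f-string, but not inside
# the {...} expression: f"{'\n'.join(columns)}" is a SyntaxError on
# Python versions before 3.12.
report = f"Columns available:\n{NEWLINE.join(columns)}"
print(report)
```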

nchs_mortality/delphi_nchs_mortality/pull.py

Lines changed: 43 additions & 13 deletions
@@ -3,7 +3,18 @@
 import numpy as np
 import pandas as pd
 from sodapy import Socrata
-from .constants import METRICS
+from .constants import METRICS, RENAME, NEWLINE
+
+def standardize_columns(df):
+    """Rename columns to comply with a standard set.
+
+    NCHS has changed column names a few times, so this will help us maintain
+    backwards-compatibility without the processing code getting all gnarly.
+    """
+    rename_pairs = [(from_col, to_col) for (from_col, to_col) in RENAME
+                    if from_col in df.columns]
+    return df.rename(columns=dict(rename_pairs))
+
 
 def pull_nchs_mortality_data(token: str, map_df: pd.DataFrame, test_mode: str):
     """Pull the latest NCHS Mortality data, and conforms it into a dataset.
@@ -42,25 +53,44 @@ def pull_nchs_mortality_data(token: str, map_df: pd.DataFrame, test_mode: str):
         # Pull data from Socrata API
         client = Socrata("data.cdc.gov", token)
         results = client.get("r8kw-7aab", limit=10**10)
-        df = pd.DataFrame.from_records(results).rename(
-            {"start_week": "timestamp"}, axis=1)
+        df = pd.DataFrame.from_records(results)
+        # drop "By Total" rows
+        df = df[df["group"].transform(str.lower) == "by week"]
     else:
         df = pd.read_csv("./test_data/%s"%test_mode)
 
-    # Check missing start_week == end_week
-    try:
-        assert sum(df["timestamp"] != df["end_week"]) == 0
-    except AssertionError as exc:
-        raise ValueError(
-            "end_week is not always the same as start_week, check the raw file"
-        ) from exc
+    df = standardize_columns(df)
+
+    if "end_date" in df.columns:
+        # Check missing week_ending_date == end_date
+        try:
+            assert all(df["week_ending_date"] == df["end_date"])
+        except AssertionError as exc:
+            raise ValueError(
+                "week_ending_date is not always the same as end_date, check the raw file"
+            ) from exc
+    else:
+        # Check missing start_week == end_week
+        try:
+            assert all(df["timestamp"] == df["end_week"])
+        except AssertionError as exc:
+            raise ValueError(
+                "end_week is not always the same as start_week, check the raw file"
+            ) from exc
 
     try:
         df = df.astype(type_dict)
     except KeyError as exc:
-        raise ValueError("Expected column(s) missed, The dataset "
-                         "schema may have changed. Please investigate and "
-                         "amend the code.") from exc
+        raise ValueError(f"""
+Expected column(s) missed, The dataset schema may
+have changed. Please investigate and amend the code.
+
+Columns needed:
+{NEWLINE.join(type_dict.keys())}
+
+Columns available:
+{NEWLINE.join(df.columns)}
+""") from exc
 
     # Drop rows for locations outside US
     df = df[df["state"] != "United States"]
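
A quick sketch of the new helper in action, using a made-up single-row frame in the old NCHS schema (assuming the package is importable):

```python
import pandas as pd

from delphi_nchs_mortality.pull import standardize_columns

# Older files ship start_week/covid_deaths; newer ones start_date/covid_19_deaths.
# Only rename pairs whose source column is actually present get applied.
old_style = pd.DataFrame({"start_week": ["2020-02-01"], "covid_deaths": [0]})
print(standardize_columns(old_style).columns.tolist())
# ['timestamp', 'covid_19_deaths']
```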

nchs_mortality/delphi_nchs_mortality/run.py

Lines changed: 11 additions & 8 deletions
@@ -6,10 +6,10 @@
 """
 from datetime import datetime, date, timedelta
 from os.path import join
+import time
 
 import numpy as np
 import pandas as pd
-import time
 from delphi_utils import read_params, S3ArchiveDiffer, get_structured_logger
 
 from .pull import pull_nchs_mortality_data
@@ -34,11 +34,13 @@ def run_module():
     token = params["token"]
     test_mode = params["mode"]
 
-    daily_arch_diff = S3ArchiveDiffer(
-        daily_cache_dir, daily_export_dir,
-        params["bucket_name"], "nchs_mortality",
-        params["aws_credentials"])
-    daily_arch_diff.update_cache()
+    if params["bucket_name"]:
+        daily_arch_diff = S3ArchiveDiffer(
+            daily_cache_dir, daily_export_dir,
+            params["bucket_name"], "nchs_mortality",
+            params["aws_credentials"])
+        daily_arch_diff.update_cache()
+
 
     map_df = pd.read_csv(
         join(static_file_dir, "state_pop.csv"), dtype={"fips": int}
@@ -87,8 +89,9 @@ def run_module():
     # Daily run of archiving utility
     # - Uploads changed files to S3
     # - Does not export any issues into receiving
-    arch_diffs(params, daily_arch_diff)
-
+    if params["bucket_name"]:
+        arch_diffs(params, daily_arch_diff)
+
     elapsed_time_in_seconds = round(time.time() - start_time, 2)
     logger.info("Completed indicator run",
                 elapsed_time_in_seconds = elapsed_time_in_seconds)
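
With this guard, local and test runs can leave the bucket unset and skip S3 entirely; a toy sketch of the effect (param values hypothetical):

```python
# An empty "bucket_name" now disables both the S3ArchiveDiffer setup and
# the post-run archiving step.
params = {"bucket_name": "", "mode": "test_data.csv"}

if params["bucket_name"]:
    print("archiving to S3 bucket:", params["bucket_name"])
else:
    print("bucket_name is empty; skipping S3 archiving")
```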

nchs_mortality/tests/test_data/test_data.csv

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-data_as_of,timestamp,end_week,group,state,indicator,covid_deaths,total_deaths,percent_of_expected_deaths,pneumonia_deaths,pneumonia_and_covid_deaths,influenza_deaths,pneumonia_influenza_or_covid_19_deaths,footnote
+data_as_of,start_week,end_week,group,state,indicator,covid_deaths,total_deaths,percent_of_expected_deaths,pneumonia_deaths,pneumonia_and_covid_19_deaths,influenza_deaths,pneumonia_influenza_or_covid_19_deaths,footnote
 2020-09-09T00:00:00.000,2020-02-01T00:00:00.000,2020-02-01T00:00:00.000,By week,United States,Week-ending,0,58570,0.99,3796,0,479,4275,
 2020-09-09T00:00:00.000,2020-02-08T00:00:00.000,2020-02-08T00:00:00.000,By week,United States,Week-ending,1,59286,0.99,3798,0,520,4319,
 2020-09-09T00:00:00.000,2020-02-15T00:00:00.000,2020-02-15T00:00:00.000,By week,United States,Week-ending,0,58691,1,3824,0,558,4382,

nchs_mortality/tests/test_pull.py

Lines changed: 22 additions & 5 deletions
@@ -5,7 +5,7 @@
 import pandas as pd
 from delphi_utils import read_params
 
-from delphi_nchs_mortality.pull import pull_nchs_mortality_data
+from delphi_nchs_mortality.pull import pull_nchs_mortality_data, standardize_columns
 from delphi_nchs_mortality.constants import METRICS
 
 params = read_params()
@@ -19,18 +19,35 @@
 )
 
 class TestPullNCHS:
+    def test_standardize_columns(self):
+        df = standardize_columns(
+            pd.DataFrame({
+                "start_week": [1],
+                "covid_deaths": [2],
+                "pneumonia_and_covid_deaths": [4],
+                "pneumonia_influenza_or_covid_19_deaths": [8]
+            }))
+        expected = pd.DataFrame({
+            "timestamp": [1],
+            "covid_19_deaths": [2],
+            "pneumonia_and_covid_19_deaths": [4],
+            "pneumonia_influenza_or_covid_19_deaths": [8]
+        })
+        pd.testing.assert_frame_equal(expected, df)
+
     def test_good_file(self):
         df = pull_nchs_mortality_data(token, map_df, "test_data.csv")
 
         # Test columns
         assert (df.columns.values == [
-                'covid_deaths', 'total_deaths', 'percent_of_expected_deaths',
-                'pneumonia_deaths', 'pneumonia_and_covid_deaths',
+                'covid_19_deaths', 'total_deaths', 'percent_of_expected_deaths',
+                'pneumonia_deaths', 'pneumonia_and_covid_19_deaths',
                 'influenza_deaths', 'pneumonia_influenza_or_covid_19_deaths',
                 "timestamp", "geo_id", "population"]).all()
 
         # Test aggregation for NYC and NY
-        raw_df = pd.read_csv("./test_data/test_data.csv", parse_dates=["timestamp"])
+        raw_df = pd.read_csv("./test_data/test_data.csv", parse_dates=["start_week"])
+        raw_df = standardize_columns(raw_df)
         for metric in METRICS:
             ny_list = raw_df.loc[(raw_df["state"] == "New York")
                                  & (raw_df[metric].isnull()), "timestamp"].values
@@ -62,4 +79,4 @@ def test_bad_file_with_inconsistent_time_col(self):
                                             "bad_data_with_missing_cols.csv")
 
 
-
+

validator/README.md

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ Please update the follow settings:
 * `data_source`: should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls
 * `end_date`: specifies the last date to be checked; if set to "latest", `end_date` will always be the current date
 * `span_length`: specifies the number of days before the `end_date` to check. `span_length` should be long enough to contain all recent source data that is still in the process of being updated (i.e. in the backfill period); for example, if the data source of interest has a 2-week lag before all reports are in for a given date, `span_length` should be 14 days
-* `suppressed_errors`: list of lists uniquely specifying errors that have been manually verified as false positives or acceptable deviations from expected
+* `suppressed_errors`: list of (`check_name`, `file_name`) pairs uniquely specifying errors that have been manually verified as false positives or acceptable deviations from expected. Either element may be `*`, which matches every check name or every file name, respectively.
 * `test_mode`: boolean; `true` checks only a small number of data files
 * `static`: settings for validations that don't require comparison with external COVIDcast API data
   * `minimum_sample_size` (default: 100): threshold for flagging small sample sizes as invalid

validator/delphi_validator/errors.py

Lines changed: 7 additions & 1 deletion
@@ -36,7 +36,13 @@ def is_suppressed(self, suppressed_errors):
         errors_to_suppress: Set[Tuple[str]]
             set of (check_name, data_name) tuples to ignore.
         """
-        return (self.check_name, self.data_name) in suppressed_errors
+        if (self.check_name, self.data_name) in suppressed_errors:
+            return True
+        if (self.check_name, "*") in suppressed_errors:
+            return True
+        if ("*", self.data_name) in suppressed_errors:
+            return True
+        return False
 
     def __str__(self):
         return f"{self.check_name} failed for {self.data_name}: {self.message}"
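
Restated outside the class, the new matching rules look like this (check and file names here are hypothetical):

```python
def is_suppressed(check_name, data_name, suppressed_errors):
    # An exact (check_name, data_name) pair, or a "*" wildcard in either
    # position, suppresses the error.
    return ((check_name, data_name) in suppressed_errors
            or (check_name, "*") in suppressed_errors
            or ("*", data_name) in suppressed_errors)

suppressed = {("check_min_allowed_max_date", "*")}
print(is_suppressed("check_min_allowed_max_date", "20201001_state_death.csv", suppressed))  # True
print(is_suppressed("check_missing_date_files", "20201001_state_death.csv", suppressed))    # False
```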
