
Release covidcast-indicators 0.3.36 #1829


Merged · 37 commits · Apr 12, 2023
Commits
6122348
move rough lag filter before validation
nmdefries Apr 4, 2023
ce00933
use distinct in validation
nmdefries Apr 4, 2023
4d913c2
call fips-geo col conversion only after binding input
nmdefries Apr 4, 2023
f916bfe
combine filter with other input steps
nmdefries Apr 4, 2023
f580f05
make sure validation imports dplyr across
nmdefries Apr 4, 2023
ea6e3d7
don't use loc to avoid a future warning about calling int on a single…
nmdefries Apr 4, 2023
e37c736
set numeric_only in geomap sum to avoid concatting strings
nmdefries Apr 4, 2023
fab4c62
replace iteritems
nmdefries Apr 4, 2023
53f0d0c
replace append
nmdefries Apr 4, 2023
f6d564d
make dates comparable
nmdefries Apr 4, 2023
5f97212
changehc
nmdefries Apr 4, 2023
2f92ca7
Merge pull request #1824 from cmu-delphi/bot/sync-prod-main
krivard Apr 4, 2023
3845a1e
claims_hosp
nmdefries Apr 4, 2023
4d7c722
doctor_visits
nmdefries Apr 5, 2023
c077c13
cpr
nmdefries Apr 5, 2023
9ca8530
nchs_mortality
nmdefries Apr 5, 2023
fcbbafc
quidel_covidtest
nmdefries Apr 5, 2023
6a82efc
linting
nmdefries Apr 5, 2023
79ab7c8
Merge branch 'main' into ndefries/pandasv2-fix-tests
nmdefries Apr 5, 2023
4f28fba
unpin pandas 2.0.0
nmdefries Apr 5, 2023
ef60502
Merge branch 'main' into ndefries/backfill/speed3
nmdefries Apr 5, 2023
513a39b
concat outside of loops
nmdefries Apr 5, 2023
1647ea2
Merge pull request #1819 from cmu-delphi/ndefries/backfill/speed3
krivard Apr 6, 2023
510a11e
log when geo-splitting is over
nmdefries Apr 11, 2023
02c8c40
remove everything in output dir
nmdefries Apr 11, 2023
5488518
add age signals to sircal params templates
M5Skid Apr 11, 2023
dd671bf
increase shared memory size to support parallel prediction generation
nmdefries Apr 11, 2023
8875b40
check logs for lack of shared memory error
nmdefries Apr 11, 2023
fb446a6
Merge pull request #1825 from cmu-delphi/ndefries/pandasv2-fix-tests
nmdefries Apr 11, 2023
87a18d7
Merge pull request #1826 from M5Skid/quidel_suppress_sircal
krivard Apr 11, 2023
af03a4b
pin bettermc to last cran-approved version
nmdefries Apr 11, 2023
b55abe8
lower shared memory allocation
nmdefries Apr 11, 2023
7ef2f5b
Merge pull request #1827 from cmu-delphi/ndefries/backfill/mem-log-ma…
nmdefries Apr 12, 2023
c7d6869
Merge pull request #1828 from cmu-delphi/ndefries/backfill/bettermc-n…
nmdefries Apr 12, 2023
af6d5bc
chore: bump delphi_utils to 0.3.13
Apr 12, 2023
8bf743f
chore: bump covidcast-indicators to 0.3.36
Apr 12, 2023
c9dd855
[create-pull-request] automated change
krivard Apr 12, 2023
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.35
+current_version = 0.3.36
 commit = True
 message = chore: bump covidcast-indicators to {new_version}
 tag = False
2 changes: 1 addition & 1 deletion _delphi_utils_python/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.12
+current_version = 0.3.13
 commit = True
 message = chore: bump delphi_utils to {new_version}
 tag = False
2 changes: 1 addition & 1 deletion _delphi_utils_python/delphi_utils/__init__.py
@@ -15,4 +15,4 @@
 from .nancodes import Nans
 from .weekday import Weekday

-__version__ = "0.3.12"
+__version__ = "0.3.13"
2 changes: 1 addition & 1 deletion _delphi_utils_python/delphi_utils/flash_eval/eval_day.py
@@ -147,7 +147,7 @@ def output(evd_ranking, day, lag, signal, logger):
     """
     starter_link = f"{HTML_LINK}{(day+pd.Timedelta(f'{lag}d')).strftime('%Y-%m_%d')}"
     p_text = ""
-    for j, (index, value) in enumerate(evd_ranking.sort_values(ascending=False).iteritems()):
+    for j, (index, value) in enumerate(evd_ranking.sort_values(ascending=False).items()):
         if j < 30:
             start_link = f"{starter_link},{day.strftime('%Y-%m_%d')},{index}"
             p_text += f"\t{start_link}|*{index}*, {'{:.2f}'.format(value)}>\n"
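
For context on the .iteritems() changes in this release: pandas 2.0 removed Series.iteritems() (deprecated since 1.5), and Series.items() yields the same (index, value) pairs on both 1.x and 2.x. A minimal sketch of the pattern, with made-up ranking values standing in for evd_ranking:

    import pandas as pd

    # Hypothetical stand-in for the evd_ranking series used above.
    evd_ranking = pd.Series({"sig_a": 0.91, "sig_b": 0.42})

    # .items() is the drop-in replacement for the removed .iteritems().
    for index, value in evd_ranking.sort_values(ascending=False).items():
        print(f"{index}: {value:.2f}")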
6 changes: 3 additions & 3 deletions _delphi_utils_python/delphi_utils/geomap.py
@@ -401,9 +401,9 @@ def replace_geocode(
             df.drop("weight", axis=1, inplace=True)

         if not date_col is None:
-            df = df.groupby([date_col, new_col]).sum().reset_index()
+            df = df.groupby([date_col, new_col]).sum(numeric_only=True).reset_index()
         else:
-            df = df.groupby([new_col]).sum().reset_index()
+            df = df.groupby([new_col]).sum(numeric_only=True).reset_index()
         return df

     def add_population_column(self, data, geocode_type, geocode_col=None, dropna=True):
@@ -501,7 +501,7 @@ def fips_to_megacounty(
         )
         data.set_index([fips_col, date_col], inplace=True)
         data = data.join(mega_data)
-        data = data.reset_index().groupby([date_col, mega_col]).sum()
+        data = data.reset_index().groupby([date_col, mega_col]).sum(numeric_only=True)
         return data.reset_index()

     def as_mapper_name(self, geo_type, state="state_id"):
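
The numeric_only changes in this file (and in the indicator packages below) guard against a pandas 2.0 behavior change: groupby aggregations now default to numeric_only=False, so string columns are summed too, which concatenates them instead of silently dropping them as 1.x did. A small sketch with invented data:

    import pandas as pd

    df = pd.DataFrame({
        "geo": ["pa", "pa", "ny"],
        "state_name": ["Pennsylvania", "Pennsylvania", "New York"],
        "val": [1.0, 2.0, 3.0],
    })

    # Without numeric_only=True, pandas 2.x would also "sum" state_name,
    # yielding strings like "PennsylvaniaPennsylvania" in the result.
    print(df.groupby("geo").sum(numeric_only=True))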
2 changes: 1 addition & 1 deletion _delphi_utils_python/delphi_utils/validator/dynamic.py
@@ -195,7 +195,7 @@ def replace_first_six(df, start_date):
                 start_date = self.params.time_window.start_date)

         if not error_df.empty:
-            for index, value in error_df.iteritems():
+            for index, value in error_df.items():
                 report.add_raised_error(
                     ValidationFailure("check_val_missing",
                                       geo_type=geo_type,
4 changes: 2 additions & 2 deletions _delphi_utils_python/setup.py
@@ -14,7 +14,7 @@
     "mock",
     "moto",
     "numpy",
-    "pandas>=1.1.0,<2",
+    "pandas>=1.1.0",
     "pydocstyle",
     "pylint==2.8.3",
     "pytest",
@@ -26,7 +26,7 @@

 setup(
     name="delphi_utils",
-    version="0.3.12",
+    version="0.3.13",
     description="Shared Utility Functions for Indicators",
     long_description=long_description,
     long_description_content_type="text/markdown",
16 changes: 8 additions & 8 deletions _delphi_utils_python/tests/test_export.py
@@ -250,15 +250,15 @@ def test_export_with_null_removal(self):
         """Test that `remove_null_samples = True` removes entries with null samples."""
         _clean_directory(self.TEST_DIR)

-        df_with_nulls = self.DF.copy().append(
-            {
+        df_with_nulls = pd.concat(
+            [self.DF.copy(),
+             pd.DataFrame({
                 "geo_id": "66666",
                 "timestamp": datetime(2020, 6, 6),
                 "val": 10,
                 "se": 0.2,
                 "sample_size": pd.NA,
-            },
-            ignore_index=True,
+             }, index = [0])]
         )

         create_export_csv(
@@ -283,15 +283,15 @@ def test_export_without_null_removal(self):
         """Test that `remove_null_samples = False` does not remove entries with null samples."""
         _clean_directory(self.TEST_DIR)

-        df_with_nulls = self.DF.copy().append(
-            {
+        df_with_nulls = pd.concat(
+            [self.DF.copy(),
+             pd.DataFrame({
                 "geo_id": "66666",
                 "timestamp": datetime(2020, 6, 6),
                 "val": 10,
                 "se": 0.2,
                 "sample_size": pd.NA,
-            },
-            ignore_index=True,
+             }, index = [0])]
         )

         create_export_csv(
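
DataFrame.append(), used in the old version of these tests, was deprecated in pandas 1.4 and removed in 2.0; the replacement is a single pd.concat over the frames to combine. A minimal sketch of the same substitution, with invented values:

    import pandas as pd

    base = pd.DataFrame({"geo_id": ["42003"], "val": [1.0]})
    new_row = {"geo_id": "66666", "val": 10.0}

    # pandas 1.x: combined = base.append(new_row, ignore_index=True)
    # pandas 2.x: wrap the row in a one-row DataFrame, then concat once.
    combined = pd.concat([base, pd.DataFrame(new_row, index=[0])], ignore_index=True)
    print(combined)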
2 changes: 1 addition & 1 deletion _delphi_utils_python/tests/test_geomap.py
@@ -196,7 +196,7 @@ def test_load_fips_chngfips_table(self, geomapper):

     def test_load_jhu_uid_fips_table(self, geomapper):
         jhu_data = geomapper.get_crosswalk(from_code="jhu_uid", to_code="fips")
-        assert np.allclose(jhu_data.groupby("jhu_uid").sum(), 1.0)
+        assert np.allclose(jhu_data.groupby("jhu_uid").sum(numeric_only=True), 1.0)

     def test_load_zip_hrr_table(self, geomapper):
         zip_data = geomapper.get_crosswalk(from_code="zip", to_code="hrr")
4 changes: 2 additions & 2 deletions _delphi_utils_python/tests/validator/test_dynamic.py
@@ -48,7 +48,7 @@ def test_half_padding(self):
             ref_df, test_df, ref_date, ref_date)

         # Check it only takes missing dates - so the last 5 dates
-        assert new_ref_df.time_value.max() == datetime.strptime("2021-01-11",
+        assert new_ref_df.time_value.max().date() == datetime.strptime("2021-01-11",
                                                                 "%Y-%m-%d").date()
         assert new_ref_df.shape[0] == 11
         assert new_ref_df["val"].iloc[5] == 2
@@ -71,7 +71,7 @@ def test_full_padding(self):
             ref_df, test_df, ref_date, ref_date)

         # Check it only takes missing dates up to the day before the reference
-        assert new_ref_df.time_value.max() == datetime.strptime("2021-01-15",
+        assert new_ref_df.time_value.max().date() == datetime.strptime("2021-01-15",
                                                                 "%Y-%m-%d").date()
         assert new_ref_df.shape[0] == 15
         assert new_ref_df["val"].iloc[5] == 2
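
The .date() additions fix a type mismatch (the "make dates comparable" commit): under pandas 2.0 the padded frames end up with time_value as datetime64, so .max() returns a pandas Timestamp, and a Timestamp never compares equal to a plain datetime.date. Converting with .date() puts both sides of the assertion in the same type. A sketch:

    from datetime import datetime
    import pandas as pd

    ts = pd.Timestamp("2021-01-11")  # what time_value.max() returns
    target = datetime.strptime("2021-01-11", "%Y-%m-%d").date()

    print(ts == target)         # False: Timestamp vs. date never matches
    print(ts.date() == target)  # True: both sides are datetime.date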
13 changes: 8 additions & 5 deletions ansible/templates/sir_complainsalot-params-prod.json.j2
@@ -32,11 +32,14 @@
       "max_age":6,
       "maintainers": ["U01AP8GSWG3","U01069KCRS7"],
       "retired-signals": [
-        "raw_pct_negative","smoothed_pct_negative","raw_tests_per_device","smoothed_tests_per_device",
-        ["covid_ag_raw_pct_positive_age_0_4", "hrr"], ["covid_ag_raw_pct_positive_age_0_4", "msa"],
-        ["covid_ag_raw_pct_positive_age_5_17", "hrr"], ["covid_ag_raw_pct_positive_age_5_17", "msa"],
-        ["covid_ag_raw_pct_positive_age_50_64", "hrr"], ["covid_ag_raw_pct_positive_age_50_64", "msa"],
-        ["covid_ag_raw_pct_positive_age_65plus", "hrr"], ["covid_ag_raw_pct_positive_age_65plus", "msa"]
+        "raw_pct_negative", "smoothed_pct_negative",
+        "raw_tests_per_device", "smoothed_tests_per_device",
+        "covid_ag_raw_pct_positive_age_0_4", "covid_ag_smoothed_pct_positive_age_0_4",
+        "covid_ag_raw_pct_positive_age_5_17", "covid_ag_smoothed_pct_positive_age_5_17",
+        "covid_ag_raw_pct_positive_age_18_49", "covid_ag_smoothed_pct_positive_age_18_49",
+        "covid_ag_raw_pct_positive_age_50_64", "covid_ag_smoothed_pct_positive_age_50_64",
+        "covid_ag_raw_pct_positive_age_65plus", "covid_ag_smoothed_pct_positive_age_65plus",
+        "covid_ag_raw_pct_positive_age_0_17", "covid_ag_smoothed_pct_positive_age_0_17"
       ]
     },
     "nchs-mortality": {
1 change: 1 addition & 0 deletions backfill_corrections/Dockerfile
@@ -26,6 +26,7 @@ RUN install2.r --error \

 RUN --mount=type=secret,id=GITHUB_TOKEN \
     export GITHUB_PAT="$(cat /run/secrets/GITHUB_TOKEN)" && \
+    R -e 'devtools::install_version("bettermc", version = "1.1.2")' && \
     R -e 'devtools::install_github("cmu-delphi/covidcast", ref = "evalcast", subdir = "R-packages/evalcast")' && \
     R -e 'devtools::install_github(repo="ryantibs/quantgen", subdir="quantgen")' && \
     R -e 'install.packages(list.files(path="/opt/gurobi/linux64/R/", pattern="^gurobi_.*[.]tar[.]gz$", full.names = TRUE), repos=NULL)'
5 changes: 4 additions & 1 deletion backfill_corrections/Makefile
@@ -67,6 +67,8 @@ run-local: setup-dirs
	grep "backfill correction completed successfully" $(LOG_FILE)
	grep "scheduled core" $(LOG_FILE) ; \
	[ "$$?" -eq 1 ]
+	grep "SIGBUS" $(LOG_FILE) ; \
+	[ "$$?" -eq 1 ]

 gurobi.lic:
	@echo WLSACCESSID=$(GRB_WLSACCESSID) >> $(GRB_LICENSE_FILE)
@@ -81,6 +83,7 @@ run:
		-v "`realpath $(USR_CACHE_DIR)`:/backfill_corrections/${CACHE_DIR}" \
		-v "${PWD}"/params.json:/backfill_corrections/params.host.json \
		--env GRB_LICENSE_FILE=$(GRB_LICENSE_FILE) \
+		--shm-size=2gb \
		-it "${DOCKER_IMAGE}:${DOCKER_TAG}" \
		/bin/bash -c "cp params.host.json params.json && make gurobi.lic && make standardize-dirs && make run-local OPTIONS=\"${OPTIONS}\" LOG_FILE=${LOG_FILE}"

@@ -124,7 +127,7 @@ standardize-dirs:
	$(PYTHON) -m delphi_utils set export_dir $(EXPORT_DIR)

 clean:
-	rm -f $(USR_EXPORT_DIR)/*.csv.gz
+	rm -rf $(USR_EXPORT_DIR)/*

 coverage:
	Rscript -e 'covr::package_coverage("delphiBackfillCorrection")'
1 change: 1 addition & 0 deletions backfill_corrections/delphiBackfillCorrection/NAMESPACE
@@ -30,6 +30,7 @@ importFrom(dplyr,arrange)
 importFrom(dplyr,bind_cols)
 importFrom(dplyr,bind_rows)
 importFrom(dplyr,desc)
+importFrom(dplyr,distinct)
 importFrom(dplyr,everything)
 importFrom(dplyr,filter)
 importFrom(dplyr,full_join)
15 changes: 6 additions & 9 deletions backfill_corrections/delphiBackfillCorrection/R/main.R
@@ -18,8 +18,6 @@
 run_backfill <- function(df, params,
                          refd_col = "time_value", lag_col = "lag", issued_col = "issue_date",
                          signal_suffixes = c(""), indicator = "", signal = "") {
-  df <- filter(df, lag < params$ref_lag + 30) # a rough filtration to save memory
-
   geo_levels <- params$geo_levels
   if ("state" %in% geo_levels) {
     # If state included, do it last since state processing modifies the
@@ -62,6 +60,7 @@ run_backfill <- function(df, params,
     msg_ts("Splitting data into geo groups")
     group_dfs <- group_split(df, geo_value)

+    msg_ts("Beginning training and/or testing...")
     # Build model for each location
     apply_fn <- ifelse(params$parallel, mclapply, lapply)
     result <- apply_fn(group_dfs, function(subdf) {
@@ -317,14 +316,12 @@ main <- function(params,

   msg_ts("Reading in and combining associated files")
   input_data <- lapply(
-    files_list,
-    function(file) {
-      # refd_col and issued_col read in as strings
-      read_data(file) %>%
-        fips_to_geovalue()
-    }
+    files_list, read_data # refd_col and issued_col read in as strings
   ) %>%
-    bind_rows()
+    bind_rows() %>%
+    fips_to_geovalue() %>%
+    # a rough filter to save memory
+    filter(lag < params$ref_lag + 30)

   if (nrow(input_data) == 0) {
     warning("No data available for indicator ", input_group$indicator,
13 changes: 9 additions & 4 deletions backfill_corrections/delphiBackfillCorrection/R/utils.R
@@ -169,6 +169,8 @@ create_dir_not_exist <- function(path)
 #' @return list of input dataframe augmented with lag column, if it
 #'   didn't already exist, and character vector of one or two value
 #'   column names, depending on requested `value_type`
+#'
+#' @importFrom dplyr distinct across
 validity_checks <- function(df, value_types, num_col, denom_col, signal_suffixes,
                             refd_col = "time_value", lag_col = "lag", issued_col = "issue_date") {
   if (!missing(signal_suffixes) && !is.na(signal_suffixes) && !all(signal_suffixes == "") && !all(is.na(signal_suffixes))) {
@@ -205,13 +207,16 @@ validity_checks <- function(df, value_types, num_col, denom_col, signal_suffixes
   }

   # Drop duplicate rows.
-  duplicate_i <- duplicated(df)
-  if (any(duplicate_i)) {
+  raw_df_rows <- nrow(df)
+  df <- distinct(df)
+  new_df_rows <- nrow(df)
+  if (raw_df_rows != new_df_rows) {
     warning("Data contains duplicate rows, dropping")
-    df <- df[!duplicate_i,]
   }

-  if (anyDuplicated(df[, c(refd_col, issued_col, "geo_value", "state_id")])) {
+  if (new_df_rows != nrow(
+    distinct(df, across(c(refd_col, issued_col, "geo_value", "state_id")))
+  )) {
     stop("Data contains multiple entries with differing values for at",
          " least one reference date-issue date-location combination")
   }
2 changes: 1 addition & 1 deletion changehc/delphi_changehc/load_data.py
@@ -71,7 +71,7 @@ def load_chng_data(filepath, dropdate, base_geo,
     ), "Counts must be nonnegative"

     # aggregate age groups (so data is unique by date and base geography)
-    data = data.groupby([base_geo, Config.DATE_COL]).sum()
+    data = data.groupby([base_geo, Config.DATE_COL]).sum(numeric_only=True)
     data.dropna(inplace=True)  # drop rows with any missing entries

     return data
2 changes: 1 addition & 1 deletion changehc/tests/test_update_sensor.py
@@ -91,7 +91,7 @@ def test_geo_reindex(self):
             "timestamp": [pd.Timestamp(f'03-{i}-2020') for i in range(1, 14)]})
         data_frame = su_inst.geo_reindex(test_data)
         assert data_frame.shape[0] == multiple*len(su_inst.fit_dates)
-        assert (data_frame.sum() == (4200,19000)).all()
+        assert (data_frame.sum(numeric_only=True) == (4200,19000)).all()

     def test_update_sensor(self):
         """Tests that the sensors are properly updated."""
2 changes: 1 addition & 1 deletion changehc/version.cfg
@@ -1 +1 @@
-current_version = 0.3.35
+current_version = 0.3.36
2 changes: 1 addition & 1 deletion claims_hosp/delphi_claims_hosp/load_data.py
@@ -47,7 +47,7 @@ def load_claims_data(claims_filepath, dropdate, base_geo):
     ), "Claims counts must be nonnegative"

     # aggregate age groups (so data is unique by date and base geography)
-    claims_data = claims_data.groupby([base_geo, Config.DATE_COL]).sum()
+    claims_data = claims_data.groupby([base_geo, Config.DATE_COL]).sum(numeric_only=True)
     claims_data.dropna(inplace=True)  # drop rows with any missing entries

     return claims_data
2 changes: 1 addition & 1 deletion claims_hosp/version.cfg
@@ -1 +1 @@
-current_version = 0.3.35
+current_version = 0.3.36
8 changes: 4 additions & 4 deletions doctor_visits/delphi_doctor_visits/geo_maps.py
@@ -49,7 +49,7 @@ def county_to_msa(self, data):
                                         from_col="PatCountyFIPS",
                                         new_col="cbsa_id")
         data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "cbsa_id"]).sum().reset_index()
+        data = data.groupby(["ServiceDate", "cbsa_id"]).sum(numeric_only=True).reset_index()

         return data.groupby("cbsa_id"), "cbsa_id"

@@ -66,7 +66,7 @@ def county_to_state(self, data):
                                         "state_id",
                                         from_col="PatCountyFIPS")
         data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "state_id"]).sum().reset_index()
+        data = data.groupby(["ServiceDate", "state_id"]).sum(numeric_only=True).reset_index()

         return data.groupby("state_id"), "state_id"

@@ -83,7 +83,7 @@ def county_to_hhs(self, data):
                                         "hhs",
                                         from_col="PatCountyFIPS")
         data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "hhs"]).sum().reset_index()
+        data = data.groupby(["ServiceDate", "hhs"]).sum(numeric_only=True).reset_index()

         return data.groupby("hhs"), "hhs"

@@ -100,7 +100,7 @@ def county_to_nation(self, data):
                                         "nation",
                                         from_col="PatCountyFIPS")
         data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "nation"]).sum().reset_index()
+        data = data.groupby(["ServiceDate", "nation"]).sum(numeric_only=True).reset_index()

         return data.groupby("nation"), "nation"
7 changes: 4 additions & 3 deletions doctor_visits/delphi_doctor_visits/sensor.py
@@ -60,16 +60,17 @@ def fill_dates(y_data, dates):
     last_date = dates[-1]
     cols = y_data.columns

+    df_list = [y_data]
     if first_date not in y_data.index:
-        y_data = y_data.append(
+        df_list.append(
             pd.DataFrame(dict.fromkeys(cols, 0.0), columns=cols, index=[first_date])
         )
     if last_date not in y_data.index:
-        y_data = y_data.append(
+        df_list.append(
             pd.DataFrame(dict.fromkeys(cols, 0.0), columns=cols, index=[last_date])
         )

-    y_data.sort_index(inplace=True)
+    y_data = pd.concat(df_list).sort_index()
     y_data = y_data.asfreq("D", fill_value=0)
     return y_data
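
fill_dates now collects the pieces in df_list and concatenates once, which is both the pandas 2.0 replacement for the removed DataFrame.append() and the idea behind the "concat outside of loops" commit elsewhere in this release: each append or pairwise concat copies everything accumulated so far, so growing a frame piece-by-piece is quadratic, while one concat over a list is linear. A sketch with invented frames:

    import pandas as pd

    frames = [pd.DataFrame({"val": [i]}, index=[i]) for i in range(3)]

    # Quadratic growth: every iteration copies the accumulated frame.
    # out = pd.DataFrame()
    # for f in frames:
    #     out = pd.concat([out, f])

    # Linear: gather the pieces, then concatenate once.
    out = pd.concat(frames).sort_index()
    print(out)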
2 changes: 1 addition & 1 deletion doctor_visits/delphi_doctor_visits/update_sensor.py
@@ -101,7 +101,7 @@ def update_sensor(
     data.dropna(inplace=True)  # drop rows with any missing entries

     # aggregate age groups (so data is unique by service date and FIPS)
-    data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum().reset_index()
+    data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum(numeric_only=True).reset_index()
     assert np.sum(data.duplicated()) == 0, "Duplicates after age group aggregation"
     assert (data[Config.COUNT_COLS] >= 0).all().all(), "Counts must be nonnegative"
2 changes: 1 addition & 1 deletion doctor_visits/version.cfg
@@ -1 +1 @@
-current_version = 0.3.35
+current_version = 0.3.36
@@ -701,6 +701,7 @@ def generate_prop_signal(df, geo, geo_mapper):
     ).groupby(
         geo
     ).sum(
+        numeric_only=True
     ).reset_index(
     )
     df = pd.merge(df, map_df, left_on="geo_id", right_on=geo, how="inner")