Skip to content

Commit 2a8deb2

Browse files
authored
Merge pull request #1606 from cmu-delphi/release/indicators_v0.3.12_utils_v0.3.3
Release covidcast-indicators 0.3.12
2 parents e444ed2 + 0005e09 commit 2a8deb2

File tree

9 files changed

+60
-26
lines changed

9 files changed

+60
-26
lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.3.11
2+
current_version = 0.3.12
33
commit = True
44
message = chore: bump covidcast-indicators to {new_version}
55
tag = False

ansible/templates/sir_complainsalot-params-prod.json.j2

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,9 @@
132132
"smoothed_vaccine_barrier_type_has", "smoothed_wvaccine_barrier_type_has",
133133
"smoothed_vaccine_barrier_none_has", "smoothed_wvaccine_barrier_none_has",
134134
"smoothed_vaccine_barrier_appointment_location_has", "smoothed_wvaccine_barrier_appointment_location_has",
135-
"smoothed_vaccine_barrier_other_has", "smoothed_wvaccine_barrier_other_has"
135+
"smoothed_vaccine_barrier_other_has", "smoothed_wvaccine_barrier_other_has",
136+
["smoothed_vaccine_barrier_appointment_location_tried", "county", "state"],
137+
["smoothed_vaccine_barrier_other_tried", "county", "state"]
136138
]
137139
},
138140
"quidel": {

dsew_community_profile/delphi_dsew_community_profile/pull.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,7 @@ def interpolate_missing_values(dfs: DataDict) -> DataDict:
617617
# https://github.com/cmu-delphi/covidcast-indicators/issues/1576
618618
_, sig, _ = key
619619
if sig == "positivity":
620+
interpolate_df[key] = df.set_index(["geo_id", "timestamp"]).sort_index().reset_index()
620621
continue
621622

622623
geo_dfs = []
@@ -628,31 +629,37 @@ def interpolate_missing_values(dfs: DataDict) -> DataDict:
628629
if "val" in reindexed_group_df.columns and not reindexed_group_df["val"].isna().all():
629630
reindexed_group_df["val"] = (
630631
reindexed_group_df["val"]
631-
.interpolate(method="linear", limit_area="inside")
632632
.astype(float)
633+
.interpolate(method="linear", limit_area="inside")
633634
)
634635
if "se" in reindexed_group_df.columns:
635636
reindexed_group_df["se"] = (
636637
reindexed_group_df["se"]
637-
.interpolate(method="linear", limit_area="inside")
638638
.astype(float)
639+
.interpolate(method="linear", limit_area="inside")
639640
)
640641
if (
641642
"sample_size" in reindexed_group_df.columns
642643
and not reindexed_group_df["sample_size"].isna().all()
643644
):
644645
reindexed_group_df["sample_size"] = (
645646
reindexed_group_df["sample_size"]
646-
.interpolate(method="linear", limit_area="inside")
647647
.astype(float)
648+
.interpolate(method="linear", limit_area="inside")
648649
)
649650
if "publish_date" in reindexed_group_df.columns:
650651
reindexed_group_df["publish_date"] = reindexed_group_df["publish_date"].fillna(
651652
method="bfill"
652653
)
654+
reindexed_group_df = reindexed_group_df[~reindexed_group_df.val.isna()]
653655
geo_dfs.append(reindexed_group_df)
654656
interpolate_df[key] = (
655-
pd.concat(geo_dfs).reset_index().rename(columns={"index": "timestamp"})
657+
pd.concat(geo_dfs)
658+
.reset_index()
659+
.rename(columns={"index": "timestamp"})
660+
.set_index(["geo_id", "timestamp"])
661+
.sort_index()
662+
.reset_index()
656663
)
657664
return interpolate_df
658665

dsew_community_profile/tests/test_pull.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from itertools import chain
55
from typing import Any, Dict, List, Union
66
import pandas as pd
7-
from pandas.util.testing import assert_frame_equal
7+
from pandas.testing import assert_frame_equal
88
import numpy as np
99
import pytest
1010
from unittest.mock import patch, Mock
@@ -506,7 +506,7 @@ def test_interpolation(self):
506506
"sample_size": [line(i) for i in range(0, 10)],
507507
"publish_date": pd.to_datetime("2022-01-10")
508508
}), dtypes=DTYPES)
509-
# A signal missing everything, should be left alone.
509+
# A signal missing everything, should be dropped since it's all NAs.
510510
missing_sig3 = sig3[(sig3.timestamp <= "2022-01-05") | (sig3.timestamp >= "2022-01-08")]
511511

512512
sig4 = _set_df_dtypes(pd.DataFrame({
@@ -517,12 +517,33 @@ def test_interpolation(self):
517517
"sample_size": [line(i) for i in range(0, 10)],
518518
"publish_date": pd.to_datetime("2022-01-10")
519519
}), dtypes=DTYPES)
520-
# A signal missing everything except for one point, should be left alone.
520+
# A signal missing everything except for one point, should output a reduced range without NAs.
521521
missing_sig4 = sig4[(sig4.timestamp <= "2022-01-05") | (sig4.timestamp >= "2022-01-08")]
522522

523523
missing_dfs = [missing_sig1, missing_sig2, missing_sig3, missing_sig4]
524524
interpolated_dfs1 = interpolate_missing_values({("src", "sig", False): pd.concat(missing_dfs)})
525-
expected_dfs = pd.concat([sig1, sig2, sig3, sig4])
525+
expected_dfs = pd.concat([sig1, sig2, sig4.loc[9:]])
526+
_assert_frame_equal(interpolated_dfs1[("src", "sig", False)], expected_dfs, index_cols=["geo_id", "timestamp"])
527+
528+
def test_interpolation_object_type(self):
529+
DTYPES = {"geo_id": str, "timestamp": "datetime64[ns]", "val": float, "se": float, "sample_size": float, "publish_date": "datetime64[ns]"}
530+
line = lambda x: 3 * x + 5
531+
532+
sig1 = _set_df_dtypes(pd.DataFrame({
533+
"geo_id": "1",
534+
"timestamp": pd.date_range("2022-01-01", "2022-01-10"),
535+
"val": [line(i) for i in range(2, 12)],
536+
"se": [line(i) for i in range(1, 11)],
537+
"sample_size": [line(i) for i in range(0, 10)],
538+
"publish_date": pd.to_datetime("2022-01-10")
539+
}), dtypes=DTYPES)
540+
# A linear signal missing two days which should be filled exactly by the linear interpolation.
541+
missing_sig1 = sig1[(sig1.timestamp <= "2022-01-05") | (sig1.timestamp >= "2022-01-08")]
542+
# set all columns to object type to simulate the miscast we sometimes see when combining dfs
543+
missing_sig1 = _set_df_dtypes(missing_sig1, {key: object for key in DTYPES.keys()})
544+
545+
interpolated_dfs1 = interpolate_missing_values({("src", "sig", False): missing_sig1})
546+
expected_dfs = pd.concat([sig1])
526547
_assert_frame_equal(interpolated_dfs1[("src", "sig", False)], expected_dfs, index_cols=["geo_id", "timestamp"])
527548

528549
@patch("delphi_dsew_community_profile.pull.INTERP_LENGTH", 2)

facebook/contingency-combine.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ suppressPackageStartupMessages({
2626
#' create new ones, relative to the current working directory.
2727
#' @param pattern Regular expression indicating which files in that directory to
2828
#' open. By default, selects all `.csv` files with standard table date prefix.
29-
run_rollup <- function(input_dir, output_dir, pattern = "^[0-9]{8}_[0-9]{8}.*[.]csv$") {
29+
run_rollup <- function(input_dir, output_dir, pattern = "^[0-9]{8}_[0-9]{8}.*[.]csv.gz$") {
3030
if (!dir.exists(output_dir)) { dir.create(output_dir) }
3131

3232
files <- list.files(input_dir, pattern = pattern)
33-
if (length(files) == 0) { stop("No matching data files.") }
33+
if (length(files) == 0) { stop("No matching contingency files to combine.") }
3434

3535
# Get df of input files and corresponding output files. Reformat as a list
3636
# such that input files with same grouping variables (and thus same output

facebook/delphiFacebook/R/contingency_write.R

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ write_contingency_tables <- function(data, params, geo_type, groupby_vars)
4545

4646
file_name <- get_file_name(params, geo_type, groupby_vars)
4747
msg_df(sprintf("saving contingency table data to %-35s", file_name), data)
48+
# Automatically uses gzip compression based on output file name.
4849
write_csv(data, file.path(params$export_dir, file_name))
4950

5051
} else {
@@ -169,7 +170,8 @@ get_file_name <- function(params, geo_type, groupby_vars) {
169170
if (!is.null(params$debug) && params$debug) {
170171
file_name <- paste0("DebugOn-DoNotShare_", file_name)
171172
}
172-
file_name <- paste0(file_name, ".csv")
173+
# Always use gzip compression.
174+
file_name <- paste0(file_name, ".csv.gz")
173175
return(file_name)
174176
}
175177

facebook/delphiFacebook/integration-tests/testthat/test-contingency-run.R

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,10 @@ test_that("small dataset produces no output", {
6767
### This test relies on `setup-run.R` to run the full pipeline and tests basic
6868
### properties of the output.
6969
test_that("full synthetic dataset produces expected output format", {
70-
expected_files <- c("20200501_20200531_monthly_nation_gender.csv")
70+
expected_files <- c("20200501_20200531_monthly_nation_gender.csv.gz")
7171
actual_files <- dir(test_path("receiving_contingency_full"))
7272

73-
out <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv"))
73+
out <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv.gz"))
7474

7575
expect_setequal(expected_files, actual_files)
7676
expect_equal(dir.exists(test_path("receiving_contingency_full")), TRUE)
@@ -101,7 +101,7 @@ test_that("simple equal-weight dataset produces correct percents", {
101101
run_contingency_tables_many_periods(params, base_aggs[2,])
102102

103103
# Expected files
104-
expect_setequal(!!dir(params$export_dir), c("20200501_20200531_monthly_nation_gender.csv"))
104+
expect_setequal(!!dir(params$export_dir), c("20200501_20200531_monthly_nation_gender.csv.gz"))
105105

106106
# Expected file contents
107107
raw_data <- read.csv(test_path("./input/simple_synthetic.csv"))
@@ -112,7 +112,7 @@ test_that("simple equal-weight dataset produces correct percents", {
112112
"us", "Female", fever_prop * 100, NA, 2000L, 100 * 2000
113113
))
114114

115-
df <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv"))
115+
df <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv.gz"))
116116
expect_equivalent(df, expected_output)
117117
})
118118

@@ -148,7 +148,7 @@ test_that("testing run with multiple aggregations per group", {
148148
represented_pct_heartdisease = 100 * 2000,
149149
)
150150

151-
out <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv"))
151+
out <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv.gz"))
152152
expect_equivalent(out, expected)
153153
})
154154

@@ -198,7 +198,7 @@ test_that("simple weighted dataset produces correct percents", {
198198
run_contingency_tables_many_periods(params, base_aggs[2,])
199199

200200
# Expected files
201-
expect_equal(!!dir(params$export_dir), c("20200501_20200531_monthly_nation_gender.csv"))
201+
expect_equal(!!dir(params$export_dir), c("20200501_20200531_monthly_nation_gender.csv.gz"))
202202

203203
# Expected file contents
204204
raw_data <- read.csv(test_path("./input/simple_synthetic.csv"))
@@ -209,7 +209,7 @@ test_that("simple weighted dataset produces correct percents", {
209209
"us", "Female", fever_prop * 100, NA, 2000L, sum(rand_weights)
210210
))
211211

212-
out <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv"))
212+
out <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv.gz"))
213213
expect_equivalent(out, expected_output)
214214
})
215215

@@ -228,7 +228,7 @@ test_that("production of historical CSVs for range of dates", {
228228

229229
run_contingency_tables_many_periods(params, base_aggs[2,])
230230
# Expected files
231-
expect_equal(!!dir(params$export_dir), c("20200503_20200509_weekly_nation_gender.csv", "20200510_20200516_weekly_nation_gender.csv"))
231+
expect_equal(!!dir(params$export_dir), c("20200503_20200509_weekly_nation_gender.csv.gz", "20200510_20200516_weekly_nation_gender.csv.gz"))
232232
})
233233

234234

facebook/delphiFacebook/unit-tests/testthat/test-contingency-write.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ test_that("testing write_contingency_tables command", {
4545
aggregate_range = "week"),
4646
"state",
4747
c("geo_id", "tested"))
48-
expect_setequal(!!dir(tdir), c("20200510_20200516_weekly_state_tested.csv"))
48+
expect_setequal(!!dir(tdir), c("20200510_20200516_weekly_state_tested.csv.gz"))
4949

50-
df <- read_csv(file.path(tdir, "20200510_20200516_weekly_state_tested.csv"))
50+
df <- read_csv(file.path(tdir, "20200510_20200516_weekly_state_tested.csv.gz"))
5151
expect_equivalent(df, test_data)
5252
})
5353

@@ -59,13 +59,13 @@ test_that("testing command to create output filenames", {
5959
end_date=as.Date("2021-01-02")
6060
)
6161
out <- get_file_name(params, "nation", c("gender"))
62-
expected <- "DebugOn-DoNotShare_20210101_20210102_monthly_nation_gender.csv"
62+
expected <- "DebugOn-DoNotShare_20210101_20210102_monthly_nation_gender.csv.gz"
6363

6464
expect_equal(out, expected)
6565

6666
params$debug <- FALSE
6767
out <- get_file_name(params, "nation", c("gender", "race", "ethnicity"))
68-
expected <- "20210101_20210102_monthly_nation_ethnicity_gender_race.csv"
68+
expected <- "20210101_20210102_monthly_nation_ethnicity_gender_race.csv.gz"
6969

7070
expect_equal(out, expected)
7171
})

sir_complainsalot/params.json.template

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,9 @@
132132
"smoothed_vaccine_barrier_type_has", "smoothed_wvaccine_barrier_type_has",
133133
"smoothed_vaccine_barrier_none_has", "smoothed_wvaccine_barrier_none_has",
134134
"smoothed_vaccine_barrier_appointment_location_has", "smoothed_wvaccine_barrier_appointment_location_has",
135-
"smoothed_vaccine_barrier_other_has", "smoothed_wvaccine_barrier_other_has"
135+
"smoothed_vaccine_barrier_other_has", "smoothed_wvaccine_barrier_other_has",
136+
["smoothed_vaccine_barrier_appointment_location_tried", "county", "state"],
137+
["smoothed_vaccine_barrier_other_tried", "county", "state"]
136138
]
137139
},
138140
"quidel": {

0 commit comments

Comments (0)