Skip to content

Commit 2a8deb2

Browse files
authored
Merge pull request #1606 from cmu-delphi/release/indicators_v0.3.12_utils_v0.3.3
Release covidcast-indicators 0.3.12
2 parents e444ed2 + 0005e09 commit 2a8deb2

File tree

9 files changed

+60
-26
lines changed

9 files changed

+60
-26
lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.3.11
2+
current_version = 0.3.12
33
commit = True
44
message = chore: bump covidcast-indicators to {new_version}
55
tag = False

ansible/templates/sir_complainsalot-params-prod.json.j2

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,9 @@
132132
"smoothed_vaccine_barrier_type_has", "smoothed_wvaccine_barrier_type_has",
133133
"smoothed_vaccine_barrier_none_has", "smoothed_wvaccine_barrier_none_has",
134134
"smoothed_vaccine_barrier_appointment_location_has", "smoothed_wvaccine_barrier_appointment_location_has",
135-
"smoothed_vaccine_barrier_other_has", "smoothed_wvaccine_barrier_other_has"
135+
"smoothed_vaccine_barrier_other_has", "smoothed_wvaccine_barrier_other_has",
136+
["smoothed_vaccine_barrier_appointment_location_tried", "county", "state"],
137+
["smoothed_vaccine_barrier_other_tried", "county", "state"]
136138
]
137139
},
138140
"quidel": {

dsew_community_profile/delphi_dsew_community_profile/pull.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,7 @@ def interpolate_missing_values(dfs: DataDict) -> DataDict:
617617
# https://github.com/cmu-delphi/covidcast-indicators/issues/1576
618618
_, sig, _ = key
619619
if sig == "positivity":
620+
interpolate_df[key] = df.set_index(["geo_id", "timestamp"]).sort_index().reset_index()
620621
continue
621622

622623
geo_dfs = []
@@ -628,31 +629,37 @@ def interpolate_missing_values(dfs: DataDict) -> DataDict:
628629
if "val" in reindexed_group_df.columns and not reindexed_group_df["val"].isna().all():
629630
reindexed_group_df["val"] = (
630631
reindexed_group_df["val"]
631-
.interpolate(method="linear", limit_area="inside")
632632
.astype(float)
633+
.interpolate(method="linear", limit_area="inside")
633634
)
634635
if "se" in reindexed_group_df.columns:
635636
reindexed_group_df["se"] = (
636637
reindexed_group_df["se"]
637-
.interpolate(method="linear", limit_area="inside")
638638
.astype(float)
639+
.interpolate(method="linear", limit_area="inside")
639640
)
640641
if (
641642
"sample_size" in reindexed_group_df.columns
642643
and not reindexed_group_df["sample_size"].isna().all()
643644
):
644645
reindexed_group_df["sample_size"] = (
645646
reindexed_group_df["sample_size"]
646-
.interpolate(method="linear", limit_area="inside")
647647
.astype(float)
648+
.interpolate(method="linear", limit_area="inside")
648649
)
649650
if "publish_date" in reindexed_group_df.columns:
650651
reindexed_group_df["publish_date"] = reindexed_group_df["publish_date"].fillna(
651652
method="bfill"
652653
)
654+
reindexed_group_df = reindexed_group_df[~reindexed_group_df.val.isna()]
653655
geo_dfs.append(reindexed_group_df)
654656
interpolate_df[key] = (
655-
pd.concat(geo_dfs).reset_index().rename(columns={"index": "timestamp"})
657+
pd.concat(geo_dfs)
658+
.reset_index()
659+
.rename(columns={"index": "timestamp"})
660+
.set_index(["geo_id", "timestamp"])
661+
.sort_index()
662+
.reset_index()
656663
)
657664
return interpolate_df
658665

dsew_community_profile/tests/test_pull.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from itertools import chain
55
from typing import Any, Dict, List, Union
66
import pandas as pd
7-
from pandas.util.testing import assert_frame_equal
7+
from pandas.testing import assert_frame_equal
88
import numpy as np
99
import pytest
1010
from unittest.mock import patch, Mock
@@ -506,7 +506,7 @@ def test_interpolation(self):
506506
"sample_size": [line(i) for i in range(0, 10)],
507507
"publish_date": pd.to_datetime("2022-01-10")
508508
}), dtypes=DTYPES)
509-
# A signal missing everything, should be left alone.
509+
# A signal missing everything, should be dropped since it's all NAs.
510510
missing_sig3 = sig3[(sig3.timestamp <= "2022-01-05") | (sig3.timestamp >= "2022-01-08")]
511511

512512
sig4 = _set_df_dtypes(pd.DataFrame({
@@ -517,12 +517,33 @@ def test_interpolation(self):
517517
"sample_size": [line(i) for i in range(0, 10)],
518518
"publish_date": pd.to_datetime("2022-01-10")
519519
}), dtypes=DTYPES)
520-
# A signal missing everything except for one point, should be left alone.
520+
# A signal missing everything except for one point, should output a reduced range without NAs.
521521
missing_sig4 = sig4[(sig4.timestamp <= "2022-01-05") | (sig4.timestamp >= "2022-01-08")]
522522

523523
missing_dfs = [missing_sig1, missing_sig2, missing_sig3, missing_sig4]
524524
interpolated_dfs1 = interpolate_missing_values({("src", "sig", False): pd.concat(missing_dfs)})
525-
expected_dfs = pd.concat([sig1, sig2, sig3, sig4])
525+
expected_dfs = pd.concat([sig1, sig2, sig4.loc[9:]])
526+
_assert_frame_equal(interpolated_dfs1[("src", "sig", False)], expected_dfs, index_cols=["geo_id", "timestamp"])
527+
528+
def test_interpolation_object_type(self):
529+
DTYPES = {"geo_id": str, "timestamp": "datetime64[ns]", "val": float, "se": float, "sample_size": float, "publish_date": "datetime64[ns]"}
530+
line = lambda x: 3 * x + 5
531+
532+
sig1 = _set_df_dtypes(pd.DataFrame({
533+
"geo_id": "1",
534+
"timestamp": pd.date_range("2022-01-01", "2022-01-10"),
535+
"val": [line(i) for i in range(2, 12)],
536+
"se": [line(i) for i in range(1, 11)],
537+
"sample_size": [line(i) for i in range(0, 10)],
538+
"publish_date": pd.to_datetime("2022-01-10")
539+
}), dtypes=DTYPES)
540+
# A linear signal missing two days which should be filled exactly by the linear interpolation.
541+
missing_sig1 = sig1[(sig1.timestamp <= "2022-01-05") | (sig1.timestamp >= "2022-01-08")]
542+
# set all columns to object type to simulate the miscast we sometimes see when combining dfs
543+
missing_sig1 = _set_df_dtypes(missing_sig1, {key: object for key in DTYPES.keys()})
544+
545+
interpolated_dfs1 = interpolate_missing_values({("src", "sig", False): missing_sig1})
546+
expected_dfs = pd.concat([sig1])
526547
_assert_frame_equal(interpolated_dfs1[("src", "sig", False)], expected_dfs, index_cols=["geo_id", "timestamp"])
527548

528549
@patch("delphi_dsew_community_profile.pull.INTERP_LENGTH", 2)

facebook/contingency-combine.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ suppressPackageStartupMessages({
2626
#' create new ones, relative to the current working directory.
2727
#' @param pattern Regular expression indicating which files in that directory to
2828
#' open. By default, selects all `.csv` files with standard table date prefix.
29-
run_rollup <- function(input_dir, output_dir, pattern = "^[0-9]{8}_[0-9]{8}.*[.]csv$") {
29+
run_rollup <- function(input_dir, output_dir, pattern = "^[0-9]{8}_[0-9]{8}.*[.]csv.gz$") {
3030
if (!dir.exists(output_dir)) { dir.create(output_dir) }
3131

3232
files <- list.files(input_dir, pattern = pattern)
33-
if (length(files) == 0) { stop("No matching data files.") }
33+
if (length(files) == 0) { stop("No matching contingency files to combine.") }
3434

3535
# Get df of input files and corresponding output files. Reformat as a list
3636
# such that input files with same grouping variables (and thus same output

facebook/delphiFacebook/R/contingency_write.R

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ write_contingency_tables <- function(data, params, geo_type, groupby_vars)
4545

4646
file_name <- get_file_name(params, geo_type, groupby_vars)
4747
msg_df(sprintf("saving contingency table data to %-35s", file_name), data)
48+
# Automatically uses gzip compression based on output file name.
4849
write_csv(data, file.path(params$export_dir, file_name))
4950

5051
} else {
@@ -169,7 +170,8 @@ get_file_name <- function(params, geo_type, groupby_vars) {
169170
if (!is.null(params$debug) && params$debug) {
170171
file_name <- paste0("DebugOn-DoNotShare_", file_name)
171172
}
172-
file_name <- paste0(file_name, ".csv")
173+
# Always use gzip compression.
174+
file_name <- paste0(file_name, ".csv.gz")
173175
return(file_name)
174176
}
175177

facebook/delphiFacebook/integration-tests/testthat/test-contingency-run.R

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,10 @@ test_that("small dataset produces no output", {
6767
### This test relies on `setup-run.R` to run the full pipeline and tests basic
6868
### properties of the output.
6969
test_that("full synthetic dataset produces expected output format", {
70-
expected_files <- c("20200501_20200531_monthly_nation_gender.csv")
70+
expected_files <- c("20200501_20200531_monthly_nation_gender.csv.gz")
7171
actual_files <- dir(test_path("receiving_contingency_full"))
7272

73-
out <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv"))
73+
out <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv.gz"))
7474

7575
expect_setequal(expected_files, actual_files)
7676
expect_equal(dir.exists(test_path("receiving_contingency_full")), TRUE)
@@ -101,7 +101,7 @@ test_that("simple equal-weight dataset produces correct percents", {
101101
run_contingency_tables_many_periods(params, base_aggs[2,])
102102

103103
# Expected files
104-
expect_setequal(!!dir(params$export_dir), c("20200501_20200531_monthly_nation_gender.csv"))
104+
expect_setequal(!!dir(params$export_dir), c("20200501_20200531_monthly_nation_gender.csv.gz"))
105105

106106
# Expected file contents
107107
raw_data <- read.csv(test_path("./input/simple_synthetic.csv"))
@@ -112,7 +112,7 @@ test_that("simple equal-weight dataset produces correct percents", {
112112
"us", "Female", fever_prop * 100, NA, 2000L, 100 * 2000
113113
))
114114

115-
df <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv"))
115+
df <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv.gz"))
116116
expect_equivalent(df, expected_output)
117117
})
118118

@@ -148,7 +148,7 @@ test_that("testing run with multiple aggregations per group", {
148148
represented_pct_heartdisease = 100 * 2000,
149149
)
150150

151-
out <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv"))
151+
out <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv.gz"))
152152
expect_equivalent(out, expected)
153153
})
154154

@@ -198,7 +198,7 @@ test_that("simple weighted dataset produces correct percents", {
198198
run_contingency_tables_many_periods(params, base_aggs[2,])
199199

200200
# Expected files
201-
expect_equal(!!dir(params$export_dir), c("20200501_20200531_monthly_nation_gender.csv"))
201+
expect_equal(!!dir(params$export_dir), c("20200501_20200531_monthly_nation_gender.csv.gz"))
202202

203203
# Expected file contents
204204
raw_data <- read.csv(test_path("./input/simple_synthetic.csv"))
@@ -209,7 +209,7 @@ test_that("simple weighted dataset produces correct percents", {
209209
"us", "Female", fever_prop * 100, NA, 2000L, sum(rand_weights)
210210
))
211211

212-
out <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv"))
212+
out <- read.csv(file.path(params$export_dir, "20200501_20200531_monthly_nation_gender.csv.gz"))
213213
expect_equivalent(out, expected_output)
214214
})
215215

@@ -228,7 +228,7 @@ test_that("production of historical CSVs for range of dates", {
228228

229229
run_contingency_tables_many_periods(params, base_aggs[2,])
230230
# Expected files
231-
expect_equal(!!dir(params$export_dir), c("20200503_20200509_weekly_nation_gender.csv", "20200510_20200516_weekly_nation_gender.csv"))
231+
expect_equal(!!dir(params$export_dir), c("20200503_20200509_weekly_nation_gender.csv.gz", "20200510_20200516_weekly_nation_gender.csv.gz"))
232232
})
233233

234234

facebook/delphiFacebook/unit-tests/testthat/test-contingency-write.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ test_that("testing write_contingency_tables command", {
4545
aggregate_range = "week"),
4646
"state",
4747
c("geo_id", "tested"))
48-
expect_setequal(!!dir(tdir), c("20200510_20200516_weekly_state_tested.csv"))
48+
expect_setequal(!!dir(tdir), c("20200510_20200516_weekly_state_tested.csv.gz"))
4949

50-
df <- read_csv(file.path(tdir, "20200510_20200516_weekly_state_tested.csv"))
50+
df <- read_csv(file.path(tdir, "20200510_20200516_weekly_state_tested.csv.gz"))
5151
expect_equivalent(df, test_data)
5252
})
5353

@@ -59,13 +59,13 @@ test_that("testing command to create output filenames", {
5959
end_date=as.Date("2021-01-02")
6060
)
6161
out <- get_file_name(params, "nation", c("gender"))
62-
expected <- "DebugOn-DoNotShare_20210101_20210102_monthly_nation_gender.csv"
62+
expected <- "DebugOn-DoNotShare_20210101_20210102_monthly_nation_gender.csv.gz"
6363

6464
expect_equal(out, expected)
6565

6666
params$debug <- FALSE
6767
out <- get_file_name(params, "nation", c("gender", "race", "ethnicity"))
68-
expected <- "20210101_20210102_monthly_nation_ethnicity_gender_race.csv"
68+
expected <- "20210101_20210102_monthly_nation_ethnicity_gender_race.csv.gz"
6969

7070
expect_equal(out, expected)
7171
})

sir_complainsalot/params.json.template

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,9 @@
132132
"smoothed_vaccine_barrier_type_has", "smoothed_wvaccine_barrier_type_has",
133133
"smoothed_vaccine_barrier_none_has", "smoothed_wvaccine_barrier_none_has",
134134
"smoothed_vaccine_barrier_appointment_location_has", "smoothed_wvaccine_barrier_appointment_location_has",
135-
"smoothed_vaccine_barrier_other_has", "smoothed_wvaccine_barrier_other_has"
135+
"smoothed_vaccine_barrier_other_has", "smoothed_wvaccine_barrier_other_has",
136+
["smoothed_vaccine_barrier_appointment_location_tried", "county", "state"],
137+
["smoothed_vaccine_barrier_other_tried", "county", "state"]
136138
]
137139
},
138140
"quidel": {

0 commit comments

Comments (0)