Skip to content

Commit dd94e04

Browse files
authored
Merge pull request #1788 from cmu-delphi/release/indicators_v0.3.31_utils_v0.3.10
2 parents 378f821 + 601485b commit dd94e04

File tree

34 files changed

+613
-118
lines changed

34 files changed

+613
-118
lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.3.30
2+
current_version = 0.3.31
33
commit = True
44
message = chore: bump covidcast-indicators to {new_version}
55
tag = False

.github/workflows/build-container-images.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,6 @@ jobs:
4343
else
4444
cd ${{ github.workspace }}/${{ matrix.packages }}
4545
echo "using tag: --${imageTag}--"
46-
docker build -t ghcr.io/${{ github.repository }}-${{ matrix.packages }}:$imageTag --file Dockerfile .
46+
DOCKER_BUILDKIT=1 BUILDKIT_PROGRESS=plain docker build --secret id=GITHUB_TOKEN -t ghcr.io/${{ github.repository }}-${{ matrix.packages }}:$imageTag --file Dockerfile .
4747
docker push ghcr.io/${{ github.repository }}-${{ matrix.packages }}:$imageTag
4848
fi

_delphi_utils_python/.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.3.9
2+
current_version = 0.3.10
33
commit = True
44
message = chore: bump delphi_utils to {new_version}
55
tag = False

_delphi_utils_python/delphi_utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@
1515
from .nancodes import Nans
1616
from .weekday import Weekday
1717

18-
__version__ = "0.3.9"
18+
__version__ = "0.3.10"

_delphi_utils_python/delphi_utils/export.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def create_export_csv(
4242
write_empty_days: Optional[bool] = False,
4343
logger: Optional[logging.Logger] = None,
4444
weekly_dates = False,
45+
sort_geos: bool = False
4546
):
4647
"""Export data in the format expected by the Delphi API.
4748
@@ -73,6 +74,9 @@ def create_export_csv(
7374
weekly_dates: Optional[bool]
7475
Whether the output data are weekly or not. If True, will prefix files with
7576
"weekly_YYYYWW" where WW is the epiweek instead of the usual YYYYMMDD for daily files.
77+
sort_geos: bool
78+
If True, the dataframe is sorted by geo before writing. Otherwise, the dataframe is
79+
written as is.
7680
7781
Returns
7882
---------
@@ -122,5 +126,7 @@ def create_export_csv(
122126
if remove_null_samples:
123127
export_df = export_df[export_df["sample_size"].notnull()]
124128
export_df = export_df.round({"val": 7, "se": 7})
129+
if sort_geos:
130+
export_df = export_df.sort_values(by="geo_id")
125131
export_df.to_csv(export_file, index=False, na_rep="NA")
126132
return dates

_delphi_utils_python/delphi_utils/runner.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,14 +66,12 @@ def run_indicator_pipeline(indicator_fn: Callable[[Params], None],
6666
validator = validator_fn(params)
6767
archiver = archiver_fn(params)
6868

69-
if flash_fn:
70-
t = threading.Timer(timer, flash_fn(params))
71-
t.start()
72-
t.join(timer)
73-
if t.is_alive():
74-
t.cancel()
75-
t.join()
76-
69+
t = threading.Timer(timer, flash_fn, args=[params])
70+
t.start()
71+
t.join(timer)
72+
if t.is_alive():
73+
t.cancel()
74+
t.join()
7775

7876
if validator:
7977
validation_report = validator.validate()

_delphi_utils_python/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
setup(
2828
name="delphi_utils",
29-
version="0.3.9",
29+
version="0.3.10",
3030
description="Shared Utility Functions for Indicators",
3131
long_description=long_description,
3232
long_description_content_type="text/markdown",

_delphi_utils_python/tests/test_export.py

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
from datetime import datetime
33
from os import listdir, remove
44
from os.path import join
5+
from typing import Any, Dict, List
56

67
import mock
78
import numpy as np
89
import pandas as pd
10+
from pandas.testing import assert_frame_equal
911

1012
from delphi_utils import create_export_csv, Nans
1113

@@ -27,6 +29,17 @@ def _non_ignored_files_set(directory):
2729
out.add(fname)
2830
return out
2931

32+
def _set_df_dtypes(df: pd.DataFrame, dtypes: Dict[str, Any]) -> pd.DataFrame:
33+
assert all(isinstance(e, type) or isinstance(e, str) for e in dtypes.values()), (
34+
"Values must be types or Pandas string aliases for types."
35+
)
36+
37+
df = df.copy()
38+
for k, v in dtypes.items():
39+
if k in df.columns:
40+
df[k] = df[k].astype(v)
41+
return df
42+
3043

3144
class TestExport:
3245
"""Tests for exporting CSVs."""
@@ -136,7 +149,7 @@ def test_export_rounding(self):
136149
geo_res="county",
137150
sensor="test",
138151
)
139-
pd.testing.assert_frame_equal(
152+
assert_frame_equal(
140153
pd.read_csv(join(self.TEST_DIR, "20200215_county_deaths_test.csv")),
141154
pd.DataFrame(
142155
{
@@ -316,7 +329,7 @@ def test_export_df_without_missingness(self):
316329
"sample_size": [100, 100],
317330
}
318331
).astype({"geo_id": str, "sample_size": int})
319-
pd.testing.assert_frame_equal(df, expected_df)
332+
assert_frame_equal(df, expected_df)
320333

321334
def test_export_df_with_missingness(self):
322335
_clean_directory(self.TEST_DIR)
@@ -348,7 +361,7 @@ def test_export_df_with_missingness(self):
348361
"missing_sample_size": [Nans.NOT_MISSING] * 2,
349362
}
350363
).astype({"geo_id": str, "sample_size": int})
351-
pd.testing.assert_frame_equal(df, expected_df)
364+
assert_frame_equal(df, expected_df)
352365

353366
@mock.patch("delphi_utils.logger")
354367
def test_export_df_with_contradictory_missingness(self, mock_logger):
@@ -372,3 +385,48 @@ def test_export_df_with_contradictory_missingness(self, mock_logger):
372385
mock_logger.info.assert_called_once_with(
373386
"Filtering contradictory missing code in test_None_2020-02-15."
374387
)
388+
389+
def test_export_sort(self):
390+
_clean_directory(self.TEST_DIR)
391+
392+
unsorted_df = pd.DataFrame({
393+
"geo_id": ["51175", "51093", "51175", "51620"],
394+
"timestamp": [
395+
datetime.strptime(x, "%Y-%m-%d")
396+
for x in ["2020-02-15", "2020-02-15", "2020-03-01", "2020-03-15"]
397+
],
398+
"val": [3.12345678910, 2.1, 2.2, 2.6],
399+
"se": [0.15, 0.22, 0.20, 0.34],
400+
"sample_size": [100, 100, 101, 100],
401+
})
402+
create_export_csv(
403+
unsorted_df,
404+
export_dir=self.TEST_DIR,
405+
geo_res="county",
406+
sensor="test"
407+
)
408+
expected_df = pd.DataFrame({
409+
"geo_id": ["51175", "51093"],
410+
"val": [3.12345678910, 2.1],
411+
"se": [0.15, 0.22],
412+
"sample_size": [100, 100],
413+
})
414+
unsorted_csv = _set_df_dtypes(pd.read_csv(join(self.TEST_DIR, "20200215_county_test.csv")), dtypes={"geo_id": str})
415+
assert_frame_equal(unsorted_csv, expected_df)
416+
417+
_clean_directory(self.TEST_DIR)
418+
create_export_csv(
419+
unsorted_df,
420+
export_dir=self.TEST_DIR,
421+
geo_res="county",
422+
sensor="test",
423+
sort_geos=True
424+
)
425+
expected_df = pd.DataFrame({
426+
"geo_id": ["51093", "51175"],
427+
"val": [2.1, 3.12345678910],
428+
"se": [0.22, 0.15],
429+
"sample_size": [100, 100],
430+
})
431+
sorted_csv = _set_df_dtypes(pd.read_csv(join(self.TEST_DIR, "20200215_county_test.csv")), dtypes={"geo_id": str})
432+
assert_frame_equal(sorted_csv,expected_df)

backfill_corrections/Dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@ RUN install2.r --error \
2525
Rglpk \
2626
argparser
2727

28-
RUN R -e 'devtools::install_github("cmu-delphi/covidcast", ref = "evalcast", subdir = "R-packages/evalcast")' && \
28+
RUN --mount=type=secret,id=GITHUB_TOKEN \
29+
export GITHUB_PAT="$(cat /run/secrets/GITHUB_TOKEN)" && \
30+
R -e 'devtools::install_github("cmu-delphi/covidcast", ref = "evalcast", subdir = "R-packages/evalcast")' && \
2931
R -e 'devtools::install_github(repo="ryantibs/quantgen", subdir="quantgen")' && \
3032
R -e 'install.packages(list.files(path="/opt/gurobi/linux64/R/", pattern="^gurobi_.*[.]tar[.]gz$", full.names = TRUE), repos=NULL)'
3133

backfill_corrections/Makefile

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
SHELL:=/bin/bash
22

33
# Change training options during `make` call via `make <command> OPTIONS="<options>"`
4-
# Allowed OPTIONS flags are `--train_models` and `--make_predictions`
4+
#
5+
# Allowed OPTIONS flags are `--train_models` and `--make_predictions`. The indicator
6+
# can be run on a single indicator at a time with `--indicators <indicator name>`,
7+
# or all indicators either by default (not passing `--indicators` at all) or with
8+
# `--indicators all`.
59
OPTIONS=
610

711
PYTHON:=env/bin/python
@@ -78,14 +82,15 @@ run:
7882
-v "${PWD}"/params.json:/backfill_corrections/params.host.json \
7983
--env GRB_LICENSE_FILE=$(GRB_LICENSE_FILE) \
8084
-it "${DOCKER_IMAGE}:${DOCKER_TAG}" \
81-
/bin/bash -c "cp params.host.json params.json && make gurobi.lic && make standardize-dirs && make run-local OPTIONS=\"${OPTIONS}\""
85+
/bin/bash -c "cp params.host.json params.json && make gurobi.lic && make standardize-dirs && make run-local OPTIONS=\"${OPTIONS}\" LOG_FILE=${LOG_FILE}"
8286

8387
publish:
84-
if [ -f $(USR_EXPORT_DIR)/*.csv.gz ]; then \
88+
NUM_FILES=`find $(USR_EXPORT_DIR) -name "*csv.gz" | wc -l`; \
89+
if [[ $$NUM_FILES -gt 0 ]]; then \
8590
aws configure set aws_access_key_id $(AWS_KEY_ID); \
8691
aws configure set aws_secret_access_key $(AWS_SECRET_KEY); \
8792
aws s3 cp $(USR_EXPORT_DIR) $(S3_BUCKET)/ --recursive --exclude "*" --include "*.csv.gz" --acl public-read; \
88-
echo "SUCCESS: published `ls -1 $(USR_EXPORT_DIR)/*.csv.gz | wc -l` files to the S3 bucket" >> $(LOG_FILE); \
93+
echo "SUCCESS: published $${NUM_FILES} files to the S3 bucket" >> $(LOG_FILE); \
8994
else \
9095
echo "No files in $(USR_EXPORT_DIR) to publish" >> $(LOG_FILE); \
9196
fi

backfill_corrections/delphiBackfillCorrection/NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ importFrom(arrow,read_parquet)
2525
importFrom(dplyr,"%>%")
2626
importFrom(dplyr,across)
2727
importFrom(dplyr,arrange)
28+
importFrom(dplyr,bind_cols)
2829
importFrom(dplyr,bind_rows)
2930
importFrom(dplyr,desc)
3031
importFrom(dplyr,everything)
@@ -36,6 +37,7 @@ importFrom(dplyr,mutate)
3637
importFrom(dplyr,pull)
3738
importFrom(dplyr,rename)
3839
importFrom(dplyr,select)
40+
importFrom(dplyr,starts_with)
3941
importFrom(dplyr,summarize)
4042
importFrom(dplyr,ungroup)
4143
importFrom(evalcast,weighted_interval_score)

backfill_corrections/delphiBackfillCorrection/R/io.R

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ read_data <- function(input_file) {
1414
#'
1515
#' @template df-template
1616
#'
17-
#' @importFrom dplyr rename %>%
17+
#' @importFrom dplyr rename select
1818
#' @importFrom rlang .data
1919
fips_to_geovalue <- function(df) {
2020
if ( !("geo_value" %in% colnames(df)) ) {
@@ -23,6 +23,9 @@ fips_to_geovalue <- function(df) {
2323
}
2424
df <- rename(df, geo_value = .data$fips)
2525
}
26+
if ( "fips" %in% colnames(df) ) {
27+
df <- select(df, -.data$fips)
28+
}
2629
return(df)
2730
}
2831

@@ -218,3 +221,72 @@ create_name_pattern <- function(indicator, signal,
218221
rollup = str_interp("${indicator}_${signal}_from_[0-9]{8}_to_[0-9]{8}[.]parquet$")
219222
)
220223
}
224+
225+
#' Get date range of data to use for training models
226+
#'
227+
#' Calculate training start and end dates based on user settings.
228+
#' `training_start_date` is the minimum allowed target date when selecting
229+
#' training data to use. `training_end_date` is the maximum allowed target
230+
#' date and maximum allowed issue date.
231+
#'
232+
#' Cases:
233+
#' 1. We are training new models.
234+
#' 2. We are not training new models and cached models exist.
235+
#' 3. We are not training new models and cached models don't exist.
236+
#'
237+
#' Sometimes we want to allow the user to specify an end date in
238+
#' params that overrides the automatically-generated end date. This is
239+
#' only relevant when the user requests to train new models.
240+
#'
241+
#' @template params-template
242+
#'
243+
#' @importFrom stringr str_interp
244+
get_training_date_range <- function(params) {
245+
default_end_date <- TODAY - params$testing_window + 1
246+
247+
if (params$train_models) {
248+
if (params_element_exists_and_valid(params, "training_end_date")) {
249+
# Use user-provided end date.
250+
training_end_date <- as.Date(params$training_end_date)
251+
} else {
252+
# Default end date is based on today (TODAY - testing_window + 1).
253+
training_end_date <- default_end_date
254+
}
255+
} else {
256+
# Get end date from cached model files. Assumes filename format like
257+
# `20220628_20220529_changehc_covid_state_lambda0.1_count_ca_lag5_tau0.9.model`
258+
# where the leading date is the training end date for that model, and the
259+
# second date is the training start date.
260+
model_files <- list.files(params$cache_dir, "^20[0-9]{6}_20[0-9]{6}.*[.]model$")
261+
if (params$indicators != "all") {
262+
# If a single indicator is specified via the command-line
263+
# `--indicators` argument, the training end date from available model
264+
# files for only that indicator will be used. This means that model
265+
# training date ranges may not match across all indicators.
266+
model_files <- list.files(
267+
params$cache_dir,
268+
str_interp("^20[0-9]{6}_20[0-9]{6}_${params$indicators}.*[.]model$")
269+
)
270+
}
271+
if (length(model_files) == 0) {
272+
# We know we'll be retraining models today.
273+
training_end_date <- default_end_date
274+
} else {
275+
# If only some models are in the cache, they will be used and those
276+
# missing will be regenerated as-of the training end date.
277+
training_end_date <- max(as.Date(substr(model_files, 1, 8), "%Y%m%d"))
278+
}
279+
}
280+
281+
# Calculate start date instead of reading from cached files. This assumes
282+
# that the user-provided `params$training_days` is more up-to-date. If
283+
# `params$training_days` has changed such that for a given training end
284+
# date, the calculated training start date differs from the start date
285+
# referenced in cached file names, then those cached files will not be used.
286+
training_start_date <- training_end_date - params$training_days
287+
288+
return(list(
289+
"training_start_date"=training_start_date,
290+
"training_end_date"=training_end_date
291+
))
292+
}

0 commit comments

Comments
 (0)