Skip to content

Commit dd94e04

Browse files
authored
Merge pull request #1788 from cmu-delphi/release/indicators_v0.3.31_utils_v0.3.10
2 parents 378f821 + 601485b commit dd94e04

File tree

34 files changed

+613
-118
lines changed

34 files changed

+613
-118
lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.3.30
2+
current_version = 0.3.31
33
commit = True
44
message = chore: bump covidcast-indicators to {new_version}
55
tag = False

.github/workflows/build-container-images.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,6 @@ jobs:
4343
else
4444
cd ${{ github.workspace }}/${{ matrix.packages }}
4545
echo "using tag: --${imageTag}--"
46-
docker build -t ghcr.io/${{ github.repository }}-${{ matrix.packages }}:$imageTag --file Dockerfile .
46+
DOCKER_BUILDKIT=1 BUILDKIT_PROGRESS=plain docker build --secret id=GITHUB_TOKEN -t ghcr.io/${{ github.repository }}-${{ matrix.packages }}:$imageTag --file Dockerfile .
4747
docker push ghcr.io/${{ github.repository }}-${{ matrix.packages }}:$imageTag
4848
fi

_delphi_utils_python/.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.3.9
2+
current_version = 0.3.10
33
commit = True
44
message = chore: bump delphi_utils to {new_version}
55
tag = False

_delphi_utils_python/delphi_utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@
1515
from .nancodes import Nans
1616
from .weekday import Weekday
1717

18-
__version__ = "0.3.9"
18+
__version__ = "0.3.10"

_delphi_utils_python/delphi_utils/export.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def create_export_csv(
4242
write_empty_days: Optional[bool] = False,
4343
logger: Optional[logging.Logger] = None,
4444
weekly_dates = False,
45+
sort_geos: bool = False
4546
):
4647
"""Export data in the format expected by the Delphi API.
4748
@@ -73,6 +74,9 @@ def create_export_csv(
7374
weekly_dates: Optional[bool]
7475
Whether the output data are weekly or not. If True, will prefix files with
7576
"weekly_YYYYWW" where WW is the epiweek instead of the usual YYYYMMDD for daily files.
77+
sort_geos: bool
78+
If True, the dataframe is sorted by geo before writing. Otherwise, the dataframe is
79+
written as is.
7680
7781
Returns
7882
---------
@@ -122,5 +126,7 @@ def create_export_csv(
122126
if remove_null_samples:
123127
export_df = export_df[export_df["sample_size"].notnull()]
124128
export_df = export_df.round({"val": 7, "se": 7})
129+
if sort_geos:
130+
export_df = export_df.sort_values(by="geo_id")
125131
export_df.to_csv(export_file, index=False, na_rep="NA")
126132
return dates

_delphi_utils_python/delphi_utils/runner.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,14 +66,12 @@ def run_indicator_pipeline(indicator_fn: Callable[[Params], None],
6666
validator = validator_fn(params)
6767
archiver = archiver_fn(params)
6868

69-
if flash_fn:
70-
t = threading.Timer(timer, flash_fn(params))
71-
t.start()
72-
t.join(timer)
73-
if t.is_alive():
74-
t.cancel()
75-
t.join()
76-
69+
t = threading.Timer(timer, flash_fn, args=[params])
70+
t.start()
71+
t.join(timer)
72+
if t.is_alive():
73+
t.cancel()
74+
t.join()
7775

7876
if validator:
7977
validation_report = validator.validate()

_delphi_utils_python/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
setup(
2828
name="delphi_utils",
29-
version="0.3.9",
29+
version="0.3.10",
3030
description="Shared Utility Functions for Indicators",
3131
long_description=long_description,
3232
long_description_content_type="text/markdown",

_delphi_utils_python/tests/test_export.py

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
from datetime import datetime
33
from os import listdir, remove
44
from os.path import join
5+
from typing import Any, Dict, List
56

67
import mock
78
import numpy as np
89
import pandas as pd
10+
from pandas.testing import assert_frame_equal
911

1012
from delphi_utils import create_export_csv, Nans
1113

@@ -27,6 +29,17 @@ def _non_ignored_files_set(directory):
2729
out.add(fname)
2830
return out
2931

32+
def _set_df_dtypes(df: pd.DataFrame, dtypes: Dict[str, Any]) -> pd.DataFrame:
33+
assert all(isinstance(e, type) or isinstance(e, str) for e in dtypes.values()), (
34+
"Values must be types or Pandas string aliases for types."
35+
)
36+
37+
df = df.copy()
38+
for k, v in dtypes.items():
39+
if k in df.columns:
40+
df[k] = df[k].astype(v)
41+
return df
42+
3043

3144
class TestExport:
3245
"""Tests for exporting CSVs."""
@@ -136,7 +149,7 @@ def test_export_rounding(self):
136149
geo_res="county",
137150
sensor="test",
138151
)
139-
pd.testing.assert_frame_equal(
152+
assert_frame_equal(
140153
pd.read_csv(join(self.TEST_DIR, "20200215_county_deaths_test.csv")),
141154
pd.DataFrame(
142155
{
@@ -316,7 +329,7 @@ def test_export_df_without_missingness(self):
316329
"sample_size": [100, 100],
317330
}
318331
).astype({"geo_id": str, "sample_size": int})
319-
pd.testing.assert_frame_equal(df, expected_df)
332+
assert_frame_equal(df, expected_df)
320333

321334
def test_export_df_with_missingness(self):
322335
_clean_directory(self.TEST_DIR)
@@ -348,7 +361,7 @@ def test_export_df_with_missingness(self):
348361
"missing_sample_size": [Nans.NOT_MISSING] * 2,
349362
}
350363
).astype({"geo_id": str, "sample_size": int})
351-
pd.testing.assert_frame_equal(df, expected_df)
364+
assert_frame_equal(df, expected_df)
352365

353366
@mock.patch("delphi_utils.logger")
354367
def test_export_df_with_contradictory_missingness(self, mock_logger):
@@ -372,3 +385,48 @@ def test_export_df_with_contradictory_missingness(self, mock_logger):
372385
mock_logger.info.assert_called_once_with(
373386
"Filtering contradictory missing code in test_None_2020-02-15."
374387
)
388+
389+
def test_export_sort(self):
390+
_clean_directory(self.TEST_DIR)
391+
392+
unsorted_df = pd.DataFrame({
393+
"geo_id": ["51175", "51093", "51175", "51620"],
394+
"timestamp": [
395+
datetime.strptime(x, "%Y-%m-%d")
396+
for x in ["2020-02-15", "2020-02-15", "2020-03-01", "2020-03-15"]
397+
],
398+
"val": [3.12345678910, 2.1, 2.2, 2.6],
399+
"se": [0.15, 0.22, 0.20, 0.34],
400+
"sample_size": [100, 100, 101, 100],
401+
})
402+
create_export_csv(
403+
unsorted_df,
404+
export_dir=self.TEST_DIR,
405+
geo_res="county",
406+
sensor="test"
407+
)
408+
expected_df = pd.DataFrame({
409+
"geo_id": ["51175", "51093"],
410+
"val": [3.12345678910, 2.1],
411+
"se": [0.15, 0.22],
412+
"sample_size": [100, 100],
413+
})
414+
unsorted_csv = _set_df_dtypes(pd.read_csv(join(self.TEST_DIR, "20200215_county_test.csv")), dtypes={"geo_id": str})
415+
assert_frame_equal(unsorted_csv, expected_df)
416+
417+
_clean_directory(self.TEST_DIR)
418+
create_export_csv(
419+
unsorted_df,
420+
export_dir=self.TEST_DIR,
421+
geo_res="county",
422+
sensor="test",
423+
sort_geos=True
424+
)
425+
expected_df = pd.DataFrame({
426+
"geo_id": ["51093", "51175"],
427+
"val": [2.1, 3.12345678910],
428+
"se": [0.22, 0.15],
429+
"sample_size": [100, 100],
430+
})
431+
sorted_csv = _set_df_dtypes(pd.read_csv(join(self.TEST_DIR, "20200215_county_test.csv")), dtypes={"geo_id": str})
432+
assert_frame_equal(sorted_csv,expected_df)

backfill_corrections/Dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@ RUN install2.r --error \
2525
Rglpk \
2626
argparser
2727

28-
RUN R -e 'devtools::install_github("cmu-delphi/covidcast", ref = "evalcast", subdir = "R-packages/evalcast")' && \
28+
RUN --mount=type=secret,id=GITHUB_TOKEN \
29+
export GITHUB_PAT="$(cat /run/secrets/GITHUB_TOKEN)" && \
30+
R -e 'devtools::install_github("cmu-delphi/covidcast", ref = "evalcast", subdir = "R-packages/evalcast")' && \
2931
R -e 'devtools::install_github(repo="ryantibs/quantgen", subdir="quantgen")' && \
3032
R -e 'install.packages(list.files(path="/opt/gurobi/linux64/R/", pattern="^gurobi_.*[.]tar[.]gz$", full.names = TRUE), repos=NULL)'
3133

backfill_corrections/Makefile

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
SHELL:=/bin/bash
22

33
# Change training options during `make` call via `make <command> OPTIONS="<options>"`
4-
# Allowed OPTIONS flags are `--train_models` and `--make_predictions`
4+
#
5+
# Allowed OPTIONS flags are `--train_models` and `--make_predictions`. The indicator
6+
# can be run on a single indicator at a time with `--indicators <indicator name>`,
7+
# or all indicators either by default (not passing `--indicators` at all) or with
8+
# `--indicators all`.
59
OPTIONS=
610

711
PYTHON:=env/bin/python
@@ -78,14 +82,15 @@ run:
7882
-v "${PWD}"/params.json:/backfill_corrections/params.host.json \
7983
--env GRB_LICENSE_FILE=$(GRB_LICENSE_FILE) \
8084
-it "${DOCKER_IMAGE}:${DOCKER_TAG}" \
81-
/bin/bash -c "cp params.host.json params.json && make gurobi.lic && make standardize-dirs && make run-local OPTIONS=\"${OPTIONS}\""
85+
/bin/bash -c "cp params.host.json params.json && make gurobi.lic && make standardize-dirs && make run-local OPTIONS=\"${OPTIONS}\" LOG_FILE=${LOG_FILE}"
8286

8387
publish:
84-
if [ -f $(USR_EXPORT_DIR)/*.csv.gz ]; then \
88+
NUM_FILES=`find $(USR_EXPORT_DIR) -name "*csv.gz" | wc -l`; \
89+
if [[ $$NUM_FILES -gt 0 ]]; then \
8590
aws configure set aws_access_key_id $(AWS_KEY_ID); \
8691
aws configure set aws_secret_access_key $(AWS_SECRET_KEY); \
8792
aws s3 cp $(USR_EXPORT_DIR) $(S3_BUCKET)/ --recursive --exclude "*" --include "*.csv.gz" --acl public-read; \
88-
echo "SUCCESS: published `ls -1 $(USR_EXPORT_DIR)/*.csv.gz | wc -l` files to the S3 bucket" >> $(LOG_FILE); \
93+
echo "SUCCESS: published $${NUM_FILES} files to the S3 bucket" >> $(LOG_FILE); \
8994
else \
9095
echo "No files in $(USR_EXPORT_DIR) to publish" >> $(LOG_FILE); \
9196
fi

backfill_corrections/delphiBackfillCorrection/NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ importFrom(arrow,read_parquet)
2525
importFrom(dplyr,"%>%")
2626
importFrom(dplyr,across)
2727
importFrom(dplyr,arrange)
28+
importFrom(dplyr,bind_cols)
2829
importFrom(dplyr,bind_rows)
2930
importFrom(dplyr,desc)
3031
importFrom(dplyr,everything)
@@ -36,6 +37,7 @@ importFrom(dplyr,mutate)
3637
importFrom(dplyr,pull)
3738
importFrom(dplyr,rename)
3839
importFrom(dplyr,select)
40+
importFrom(dplyr,starts_with)
3941
importFrom(dplyr,summarize)
4042
importFrom(dplyr,ungroup)
4143
importFrom(evalcast,weighted_interval_score)

backfill_corrections/delphiBackfillCorrection/R/io.R

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ read_data <- function(input_file) {
1414
#'
1515
#' @template df-template
1616
#'
17-
#' @importFrom dplyr rename %>%
17+
#' @importFrom dplyr rename select
1818
#' @importFrom rlang .data
1919
fips_to_geovalue <- function(df) {
2020
if ( !("geo_value" %in% colnames(df)) ) {
@@ -23,6 +23,9 @@ fips_to_geovalue <- function(df) {
2323
}
2424
df <- rename(df, geo_value = .data$fips)
2525
}
26+
if ( "fips" %in% colnames(df) ) {
27+
df <- select(df, -.data$fips)
28+
}
2629
return(df)
2730
}
2831

@@ -218,3 +221,72 @@ create_name_pattern <- function(indicator, signal,
218221
rollup = str_interp("${indicator}_${signal}_from_[0-9]{8}_to_[0-9]{8}[.]parquet$")
219222
)
220223
}
224+
225+
#' Get date range of data to use for training models
226+
#'
227+
#' Calculate training start and end dates based on user settings.
228+
#' `training_start_date` is the minimum allowed target date when selecting
229+
#' training data to use. `training_end_date` is the maximum allowed target
230+
#' date and maximum allowed issue date.
231+
#'
232+
#' Cases:
233+
#' 1. We are training new models.
234+
#' 2. We are not training new models and cached models exist.
235+
#' 3. We are not training new models and cached models don't exist.
236+
#'
237+
#' Sometimes we want to allow the user to specify an end date in
238+
#' params that overrides the automatically-generated end date. This is
239+
#' only relevant when the user requests to train new models.
240+
#'
241+
#' @template params-template
242+
#'
243+
#' @importFrom stringr str_interp
244+
get_training_date_range <- function(params) {
245+
default_end_date <- TODAY - params$testing_window + 1
246+
247+
if (params$train_models) {
248+
if (params_element_exists_and_valid(params, "training_end_date")) {
249+
# Use user-provided end date.
250+
training_end_date <- as.Date(params$training_end_date)
251+
} else {
252+
# Default end date is based on today (TODAY - testing_window + 1).
253+
training_end_date <- default_end_date
254+
}
255+
} else {
256+
# Get end date from cached model files. Assumes filename format like
257+
# `20220628_20220529_changehc_covid_state_lambda0.1_count_ca_lag5_tau0.9.model`
258+
# where the leading date is the training end date for that model, and the
259+
# second date is the training start date.
260+
model_files <- list.files(params$cache_dir, "^20[0-9]{6}_20[0-9]{6}.*[.]model$")
261+
if (params$indicators != "all") {
262+
# If a single indicator is specified via the command-line
263+
# `--indicators` argument, the training end date from available model
264+
# files for only that indicator will be used. This means that model
265+
# training date ranges may not match across all indicators.
266+
model_files <- list.files(
267+
params$cache_dir,
268+
str_interp("^20[0-9]{6}_20[0-9]{6}_${params$indicators}.*[.]model$")
269+
)
270+
}
271+
if (length(model_files) == 0) {
272+
# We know we'll be retraining models today.
273+
training_end_date <- default_end_date
274+
} else {
275+
# If only some models are in the cache, they will be used and those
276+
# missing will be regenerated as-of the training end date.
277+
training_end_date <- max(as.Date(substr(model_files, 1, 8), "%Y%m%d"))
278+
}
279+
}
280+
281+
# Calculate start date instead of reading from cached files. This assumes
282+
# that the user-provided `params$training_days` is more up-to-date. If
283+
# `params$training_days` has changed such that for a given training end
284+
# date, the calculated training start date differs from the start date
285+
# referenced in cached file names, then those cached files will not be used.
286+
training_start_date <- training_end_date - params$training_days
287+
288+
return(list(
289+
"training_start_date"=training_start_date,
290+
"training_end_date"=training_end_date
291+
))
292+
}

0 commit comments

Comments
 (0)