Fix guess_period on datetimes, make it more precise + generic

brookslogan · brookslogan · commit 00804257f334 · 2024-07-17T23:30:38.000-07:00
- Don't discard units and effectively replace them with seconds
- Don't allow any tolerance in judging a remainder to be zero, since when we use
  it to generate the default `ref_time_values` that means we could miss
  reproducing some of the actual input time values.
- Make it into an S3 generic so it can be extended for more time classes.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: epiprocess
 Title: Tools for basic signal processing in epidemiology
-Version: 0.7.12
+Version: 0.7.13
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")),
diff --git a/NAMESPACE b/NAMESPACE
@@ -25,6 +25,9 @@ S3method(group_by,grouped_epi_archive)
 S3method(group_by_drop_default,grouped_epi_archive)
 S3method(group_modify,epi_df)
 S3method(groups,grouped_epi_archive)
+S3method(guess_period,Date)
+S3method(guess_period,POSIXt)
+S3method(guess_period,default)
 S3method(key_colnames,data.frame)
 S3method(key_colnames,default)
 S3method(key_colnames,epi_archive)
@@ -64,6 +67,7 @@ export(filter)
 export(group_by)
 export(group_modify)
 export(growth_rate)
+export(guess_period)
 export(is_epi_df)
 export(is_grouped_epi_archive)
 export(key_colnames)
diff --git a/NEWS.md b/NEWS.md
@@ -35,6 +35,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 - Improved documentation web site landing page's introduction.
 - Fixed documentation referring to old `epi_slide()` interface (#466, thanks
   @XuedaShen!).
+- Fixed bug where `epix_slide_ref_time_values_default()` on datetimes would
+  output a huge number of `ref_time_values` spaced apart by mere seconds.
 
 ## Cleanup
 - Resolved some linting messages in package checks (#468).
diff --git a/R/utils.R b/R/utils.R
@@ -670,28 +670,52 @@ gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) {
   vctrs::vec_cast(numeric_gcd, dividends)
 }
 
-#' Use max valid period as guess for `period` of `ref_time_values`
-#'
-#' @param ref_time_values Vector containing time-interval-like or time-like
-#'   data, with at least two distinct values, [`diff`]-able (e.g., a
-#'   `time_value` or `version` column), and should have a sensible result from
-#'   adding `is.numeric` versions of its `diff` result (via `as.integer` if its
-#'   `typeof` is `"integer"`, otherwise via `as.numeric`).
-#' @param ref_time_values_arg Optional, string; name to give `ref_time_values`
-#'   in error messages. Defaults to quoting the expression the caller fed into
-#'   the `ref_time_values` argument.
-#' @return `is.numeric`, length 1; attempts to match `typeof(ref_time_values)`
-guess_period <- function(ref_time_values, ref_time_values_arg = rlang::caller_arg(ref_time_values)) {
-  sorted_distinct_ref_time_values <- sort(unique(ref_time_values))
-  if (length(sorted_distinct_ref_time_values) < 2L) {
-    cli_abort("Not enough distinct values in {.code {ref_time_values_arg}} to guess the period.", ref_time_values_arg)
+#' Use max valid period as guess for `period` of `time_values`
+#'
+#' `r lifecycle::badge("experimental")`
+#'
+#' @param time_values Vector containing time-interval-like or time-point-like
+#'   data, with at least two distinct values.
+#' @param time_values_arg Optional, string; name to give `time_values` in error
+#'   messages. Defaults to quoting the expression the caller fed into the
+#'   `time_values` argument.
+#' @param ... Arguments for methods; the default method requires it be empty.
+#' @return length-1 vector; `r lifecycle::badge("experimental")` class will
+#'   either be that of [`base::diff()`] on such time values, an integer, or a
+#'   double, such that every time value equals the smallest time value plus
+#'   `k * result` for some integer k, and no larger `result` achieves this.
+#' @export
+guess_period <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) {
+  UseMethod("guess_period")
+}
+
+#' @export
+guess_period.default <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) {
+  rlang::check_dots_empty()
+  sorted_distinct_time_values <- sort(unique(time_values))
+  if (length(sorted_distinct_time_values) < 2L) {
+    cli_abort("Not enough distinct values in {.code {time_values_arg}} to guess the period.",
+      class = "epiprocess__guess_period__not_enough_times",
+      time_values = time_values
+    )
   }
-  skips <- diff(sorted_distinct_ref_time_values)
-  decayed_skips <-
-    if (typeof(skips) == "integer") {
-      as.integer(skips)
-    } else {
-      as.numeric(skips)
-    }
-  gcd_num(decayed_skips)
+  skips <- diff(sorted_distinct_time_values)
+  # Some diff results carry a class/attributes (e.g., difftime): strip them
+  # via vctrs for gcd_num, then restore them on the result. `rrtol = 0` allows
+  # no tolerance on remainders, so stepping by the period misses no input value.
+  skips_data <- vctrs::vec_data(skips)
+  period_data <- gcd_num(skips_data, rrtol = 0)
+  vctrs::vec_restore(period_data, skips)
+}
+
+# `full_seq()` doesn't like difftimes, so return plain numbers in natural units (Date: days, POSIXt: secs):
+
+#' @export
+guess_period.Date <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) {
+  as.numeric(NextMethod(), units = "days")
+}
+
+#' @export
+guess_period.POSIXt <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) {
+  as.numeric(NextMethod(), units = "secs")
 }
diff --git a/man/guess_period.Rd b/man/guess_period.Rd
diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
@@ -231,3 +231,53 @@ test_that("as_slide_computation raises errors as expected", {
     class = "epiprocess__as_slide_computation__cant_convert_catchall"
   )
 })
+
+test_that("guess_period works", {
+  # Error cases:
+  expect_error(guess_period(numeric(0L)), class = "epiprocess__guess_period__not_enough_times")
+  expect_error(guess_period(c(1)), class = "epiprocess__guess_period__not_enough_times")
+  # Different numeric classes and cases:
+  expect_identical(guess_period(c(1, 8)), 7)
+  expect_identical(guess_period(c(1, 8, 15)), 7)
+  expect_identical(guess_period(c(1L, 8L, 15L)), 7L)
+  expect_identical(guess_period(c(0, 7, 14, 15)), 1)
+  # We currently allow the guessed frequency to not appear in the diffs, but
+  # this might not be a good idea as it likely indicates an issue with the data.
+  # If we drop this behavior we could also drop the gcd algorithm by just
+  # checking the validity of the smallest diff:
+  expect_identical(guess_period(c(0, 2, 5)), 1)
+  expect_identical(guess_period(c(0, 4, 10)), 2)
+  # On Dates:
+  daily_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "day")
+  weekly_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "week")
+  expect_identical(
+    daily_dates[[1L]] + guess_period(daily_dates) * (seq_along(daily_dates) - 1L),
+    daily_dates
+  )
+  expect_identical(
+    weekly_dates[[1L]] + guess_period(weekly_dates) * (seq_along(weekly_dates) - 1L),
+    weekly_dates
+  )
+  # On POSIXcts (valid Olson tz name; no DST change within these January dates):
+  daily_posixcts <- as.POSIXct(daily_dates, tz = "America/New_York") + 3600
+  weekly_posixcts <- as.POSIXct(weekly_dates, tz = "America/New_York") + 3600
+  expect_identical(
+    daily_posixcts[[1L]] + guess_period(daily_posixcts) * (seq_along(daily_posixcts) - 1L),
+    daily_posixcts
+  )
+  expect_identical(
+    weekly_posixcts[[1L]] + guess_period(weekly_posixcts) * (seq_along(weekly_posixcts) - 1L),
+    weekly_posixcts
+  )
+  # On POSIXlts (NOTE(review): `+` on a POSIXlt yields a POSIXct; confirm POSIXlt input is covered):
+  daily_posixlts <- as.POSIXlt(daily_dates, tz = "America/New_York") + 3600
+  weekly_posixlts <- as.POSIXlt(weekly_dates, tz = "America/New_York") + 3600
+  expect_identical(
+    daily_posixlts[[1L]] + guess_period(daily_posixlts) * (seq_along(daily_posixlts) - 1L),
+    daily_posixlts
+  )
+  expect_identical(
+    weekly_posixlts[[1L]] + guess_period(weekly_posixlts) * (seq_along(weekly_posixlts) - 1L),
+    weekly_posixlts
+  )
+})