Skip to content

add caching #153

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
b3f7e81
feat: basic cache; it loads! needs tests and docs
dsweber2 Aug 24, 2023
11c01ed
fix: logic rewrite and bug fix for the cache
dsweber2 Aug 30, 2023
679ce1a
docs+feat: cache checks if dir doesn't exist
dsweber2 Aug 25, 2023
8c3081e
fix: don't include git stuff...
dsweber2 Aug 25, 2023
dd5c676
docs: update documentation (GHA)
dsweber2 Aug 30, 2023
e91dea7
feat+docs: conditional cache, global save dir
dsweber2 Aug 31, 2023
2504966
docs: update documentation (GHA)
dsweber2 Aug 31, 2023
eae1094
feat: warn when caching values from the past week
dsweber2 Aug 31, 2023
0e791a1
fix: catches by Dmitry
dsweber2 Aug 31, 2023
58c8a36
docs: update documentation (GHA)
dsweber2 Aug 31, 2023
663971a
feat: separate logic functs, script cache creation
dsweber2 Sep 1, 2023
ccf44d1
docs+test: cache docs and tests
dsweber2 Sep 1, 2023
84e4035
Update documentation
dsweber2 Sep 1, 2023
2eb5301
fix: collate avail_endpoints
dsweber2 Sep 1, 2023
f6705a0
fix: actually working tests and build locally
dsweber2 Sep 1, 2023
c4408b3
fix: actually import cachem and openssl
dsweber2 Sep 1, 2023
f7a46b3
fix: styler, build, test, check all happy
dsweber2 Sep 5, 2023
3fc9334
docs: document (GHA)
dsweber2 Sep 5, 2023
6c0e8fa
test: rlang needed for testing
dsweber2 Sep 5, 2023
5e17a36
fix: happy linter
dsweber2 Sep 5, 2023
b37ab31
lint: minor newline for readability
dshemetov Sep 6, 2023
b814dfd
fix: bugs CI can't catch b/c readline, extra tests
dsweber2 Sep 6, 2023
b93b97c
lint: rewrap long doc strings
dshemetov Sep 6, 2023
e6ca8f2
docs: document (GHA)
dshemetov Sep 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,18 @@ Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
Imports:
cachem,
checkmate,
cli,
httr,
glue,
jsonlite,
magrittr,
MMWRweek,
purrr,
openssl,
readr,
rlang,
tibble,
xml2
RoxygenNote: 7.2.3
Expand All @@ -45,3 +49,17 @@ Suggests:
VignetteBuilder: knitr
Language: en-US
Config/testthat/edition: 3
Collate:
'auth.R'
'avail_endpoints.R'
'cache.R'
'check.R'
'constants.R'
'covidcast.R'
'endpoints.R'
'epidatacall.R'
'epidatr-package.R'
'model.R'
'request.R'
'utils-pipe.R'
'utils.R'
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ S3method(print,covidcast_epidata)
S3method(print,epidata_call)
export("%>%")
export(avail_endpoints)
export(cache_info)
export(clear_cache)
export(covid_hosp_facility)
export(covid_hosp_facility_lookup)
export(covid_hosp_state_timeseries)
Expand All @@ -16,6 +18,7 @@ export(covidcast_epidata)
export(covidcast_meta)
export(delphi)
export(dengue_nowcast)
export(disable_cache)
export(ecdc_ili)
export(epirange)
export(fetch)
Expand All @@ -40,7 +43,11 @@ export(pvt_norostat)
export(pvt_quidel)
export(pvt_sensors)
export(pvt_twitter)
export(set_cache)
export(wiki)
import(cachem)
import(glue)
import(openssl)
importFrom(MMWRweek,MMWRweek2Date)
importFrom(checkmate,assert)
importFrom(checkmate,assert_character)
Expand Down Expand Up @@ -72,6 +79,7 @@ importFrom(readr,read_csv)
importFrom(tibble,as_tibble)
importFrom(tibble,tibble)
importFrom(utils,help.search)
importFrom(utils,sessionInfo)
importFrom(xml2,read_html)
importFrom(xml2,xml_find_all)
importFrom(xml2,xml_text)
298 changes: 298 additions & 0 deletions R/cache.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,298 @@
# IMPORTANT DEV NOTE: make sure to @include cache.R in the Roxygen docs of any
# function referencing this environment, so this file is loaded first
cache_environ <- new.env(parent = emptyenv())
cache_environ$use_cache <- NULL
cache_environ$epidatr_cache <- NULL
#' create or renew a cache for this session
#' @aliases set_cache
#' @description
#' By default, epidatr re-requests data from the API on every call of `fetch`.
#' In case you find yourself repeatedly calling the same data, you can enable
#' the cache using either this function for a given session, or environmental
#' variables for a persistent cache.
#' The typical recommended workflow for using the cache is to set the
#' environmental variables `EPIDATR_USE_CACHE=TRUE` and
#' `EPIDATR_CACHE_DIRECTORY="/your/directory/here"`in your `.Renviron`, for
#' example by calling `usethis::edit_r_environ()`.
#' See the parameters below for some more configurables if you're so inclined.
#'
#' `set_cache` (re)defines the cache to use in a particular R session. This does
#' not clear existing data at any previous location, but instead creates a
#' handle to the new cache using [cachem](https://cachem.r-lib.org/index.html)
#' that seamlessly handles caching for you.
#' Say your cache is normally stored in some default directory, but for the
#' current session you want to save your results in
#' `~/my/temporary/savedirectory`, then you would call `set_cache(dir =
#' "~/my/temporary/savedirectory")`.
#' Or if you know the data from 2 days ago is wrong, you could call
#' `set_cache(days = 1)` to clear older data whenever the cache is referenced.
#' In both cases, these changes would only last for a single session (though the
#' deleted data would be gone permanently!).
#'
#' An important feature of the caching in this package is that only calls which
#' specify either `issues` before a certain date, or `as_of` before a certain
#' date will actually cache. For example the call
#' ```
#' covidcast(
#' source = "jhu-csse",
#' signals = "confirmed_7dav_incidence_prop",
#' geo_type = "state",
#' time_type = "day",
#' geo_values = "ca,fl",
#' time_values = epirange(20200601, 20230801)
#' )
#' ```
#' *won't* cache, since it is possible for the cache to be invalidated by new
#' releases with no warning. On the other hand, the call
#' ```
#' covidcast(
#' source = "jhu-csse",
#' signals = "confirmed_7dav_incidence_prop",
#' geo_type = "state",
#' time_type = "day",
#' geo_values = "ca,fl",
#' time_values = epirange(20200601, 20230801),
#' as_of = "2023-08-01"
#' )
#' ```
#' *will* cache, since normal new versions of data can't invalidate it (since
#' they would be `as_of` a later date). It is still possible that Delphi may
#' patch such data, but the frequency is on the order of months rather than
#' days. We are working on creating a public channel to communicate such
#' updates. While specifying `issues` will usually cache, a call with
#' `issues="*"` won't cache, since its subject to cache invalidation by normal
#' versioning.
#'
#' On the backend, the cache uses cachem, with filenames generated using an md5
#' encoding of the call url. Each file corresponds to a unique epidata-API
#' call.
#' @examples
#' \dontrun{
#' set_cache(
#' dir = "some/subdir",
#' days = 14,
#' max_size = 512,
#' logfile = "some/subdir/logs.txt",
#' prune_rate = 20L
#' )
#' }
#'
#' @param cache_dir the directory in which the cache is stored. By default, this
#' is `tools::R_user_dir()` if on R 4.0+, but must be specified for earlier
#' versions of R. The path can be either relative or absolute. The
#' environmental variable is `EPIDATR_CACHE_DIR`.
#' @param days the maximum length of time in days to keep any particular cached
#' call. By default this is `1`. The environmental variable is
#' `EPIDATR_CACHE_MAX_AGE_DAYS`.
#' @param max_size the size of the entire cache, in MB, at which to start
#' pruning entries. By default this is `1024`, or 1GB. The environmental
#' variable is `EPIDATR_CACHE_MAX_SIZE_MB`.
#' @param logfile where cachem's log of transactions is stored, relative to the
#' cache directory. By default, it is `"logfile.txt"`. The environmental
#' variable is `EPIDATR_CACHE_LOGFILE`.
#' @param prune_rate how many calls to go between checking if any cache elements
#' are too old or if the cache overall is too large. Defaults to `2000L`.
#' Since cachem fixes the max time between prune checks to 5 seconds, there's
#' little reason to actually change this parameter. Doesn't have a
#' corresponding environmental variable.
#' @param confirm whether to confirm directory creation. default is `TRUE`;
#' should only be set in non-interactive scripts
#' @seealso [clear_cache] to delete the old cache while making a new one,
#' [disable_cache] to disable without deleting, and [cache_info]
#' @export
#' @import cachem
#' @import glue
#' @importFrom utils sessionInfo
set_cache <- function(cache_dir = NULL,
days = NULL,
max_size = NULL,
logfile = NULL,
prune_rate = 2000L,
confirm = TRUE) {
if (is.null(cache_dir) && sessionInfo()$R.version$major >= 4) {
cache_dir <- Sys.getenv("EPIDATR_CACHE_DIR", unset = tools::R_user_dir("epidatr"))
} else if (is.null(cache_dir)) {
# earlier version, so no tools
cache_dir <- Sys.getenv("EPIDATR_CACHE_DIR")
if (cache_dir == "") {
cli::cli_abort("no valid EPIDATR_CACHE_DIR", class = "epidatr_cache_error")
}
}
stopifnot(is.character(cache_dir))
if (is.null(days)) {
days <- Sys.getenv("EPIDATR_CACHE_MAX_AGE_DAYS", unset = 1) %>% as.numeric()
}
if (is.null(max_size)) {
max_size <- Sys.getenv("EPIDATR_CACHE_MAX_SIZE_MB", unset = 1024) %>% as.numeric()
}
if (is.null(logfile)) {
logfile <- Sys.getenv("EPIDATR_CACHE_LOGFILE", unset = "logfile.txt")
}
stopifnot(is.character(logfile))
stopifnot(is.numeric(days), is.numeric(max_size), is.integer(prune_rate))
#
# make sure that that directory exists and drag the user into that process
cache_exists <- file.exists(cache_dir)
cache_usable <- file.access(cache_dir, mode = 6) == 0
if (!(cache_exists)) {
if (confirm) {
user_input <- readline(glue::glue(
"there is no directory at {cache_dir}; the cache will be turned off until a ",
"viable directory has been set. Create one? (yes|no(default)) "
))
repeat {
valid_user_input <- ifelse(grepl("yes|no", user_input), sub(".*(yes|no).*", "\\1", user_input), NA)
if (user_input == "") {
valid_user_input <- ""
}
if (!is.na(valid_user_input)) {
break
}
user_input <- readline(glue::glue(" please answer either yes or no: "))
}
} else {
valid_user_input <- "yes"
}
if (valid_user_input == "yes") {
dir.create(cache_dir, showWarnings = TRUE, recursive = TRUE)
cache_exists <- TRUE
cache_usable <- file.access(cache_dir, mode = 6) == 0
}
}


if (!cache_usable) {
print(glue::glue(
"The directory at {cache_dir} is not accessible; check permissions and/or use a different ",
"directory for the cache (see the `set_cache` documentation)."
))
} else if (cache_exists) {
cache_environ$epidatr_cache <- cachem::cache_disk(
dir = cache_dir,
max_size = as.integer(max_size * 1024^2),
max_age = days * 24 * 60 * 60,
logfile = file.path(cache_dir, logfile),
prune_rate = prune_rate
)
}
}

#' manually reset the cache, deleting all currently saved data and starting afresh
#' @description
#' deletes the current cache and resets a new cache. Deletes local data! If you
#' are using a session unique cache, you will have to pass the arguments you
#' used for `set_cache` earlier, otherwise the system-wide `.Renviron`-based
#' defaults will be used.
#' @examples
#' \dontrun{
#' clear_cache(
#' dir = "some/subdir",
#' days = 14,
#' max_size = 512,
#' logfile = "some/subdir/logs.txt",
#' prune_rate = 20L
#' )
#' }
#' @param disable instead of setting a new cache, disable caching entirely;
#' defaults to `FALSE`
#' @param ... see the `set_cache` arguments below
#' @inheritParams set_cache
#' @seealso [set_cache] to start a new cache (and general caching info),
#' [disable_cache] to only disable without deleting, and [cache_info]
#' @export
#' @import cachem
clear_cache <- function(disable = FALSE, ...) {
cache_environ$epidatr_cache$destroy()
if (!disable) {
set_cache(...)
} else {
cache_environ$epidatr_cache <- NULL
}
}

#' turn off the caching for this session
#' @description
#' Disable caching until you call `set_cache` or restart R. The files defining
#' the cache are untouched. If you are looking to disable the caching more
#' permanently, set `EPIDATR_USE_CACHE=FALSE` as environmental variable in
#' your `.Renviron`.
#' @export
#' @seealso [set_cache] to start a new cache (and general caching info),
#' [clear_cache] to delete the cache and set a new one, and [cache_info]
#' @import cachem
disable_cache <- function() {
cache_environ$epidatr_cache <- NULL
}

#' describe current cache
#' @description
#' Print out the information about the cache (as would be returned by cachem's
#' `info()` method)
#' @seealso [set_cache] to start a new cache (and general caching info),
#' [clear_cache] to delete the cache and set a new one, and [disable_cache] to
#' disable without deleting
#' @export
cache_info <- function() {
if (is.null(cache_environ$epidatr_cache)) {
return("there is no cache")
} else {
return(cache_environ$epidatr_cache$info())
}
}

#' dispatch caching
#'
#' @description
#' the guts of caching, its interposed between fetch and the specific fetch
#' methods. Internal method only.
#'
#' @param call the `epidata_call` object
#' @param fetch_args the args list for fetch as generated by [fetch_args_list()]
#' @keywords internal
#' @import cachem openssl
cache_epidata_call <- function(epidata_call, fetch_args = fetch_args_list()) {
is_cachable <- check_is_cachable(epidata_call, fetch_args)
if (is_cachable) {
target <- request_url(epidata_call)
hashed <- md5(target)
cached <- cache_environ$epidatr_cache$get(hashed)
as_of_recent <- check_is_recent(epidata_call$params$as_of, 7)
issues_recent <- check_is_recent(epidata_call$params$issues, 7)
if (as_of_recent || issues_recent) {
cli::cli_warn("using cached results with `as_of` within the past week (or the future!). This will likely result ",
"in an invalid cache. Consider\n",
"1. disabling the cache for this session with `disable_cache` or permanently with environmental ",
"variable `EPIDATR_USE_CACHE=FALSE`\n",
"2. setting `EPIDATR_CACHE_MAX_AGE_DAYS={Sys.getenv('EPIDATR_CACHE_MAX_AGE_DAYS', unset = 1)}` to e.g. `3/24` ",
"(3 hours).",
.frequency = "regularly",
.frequency_id = "cache timing issues",
class = "cache_recent_data"
)
}
if (!is.key_missing(cached)) {
cli::cli_warn(
c(
"loading from the cache at {cache_environ$epidatr_cache$info()$dir}; ",
"see {cache_environ$epidatr_cache$info()$logfile} for more details."
),
.frequency = "regularly",
.frequency_id = "using the cache",
class = "cache_access"
)
return(cached[[1]])
}
}
'which was saved on {format(cached[[2]],"%A %B %d, %Y")}, which took {round(cached[[3]][[3]], digits=5)} seconds.'
# need to actually get the data, since its either not in the cache or we're not caching
runtime <- system.time(if (epidata_call$only_supports_classic) {
fetched <- fetch_classic(epidata_call, fetch_args)
} else {
fetched <- fetch_tbl(epidata_call, fetch_args)
})
# add it to the cache if appropriate
if (is_cachable) {
cache_environ$epidatr_cache$set(hashed, list(fetched, Sys.time(), runtime))
}
return(fetched)
}
7 changes: 2 additions & 5 deletions R/epidatacall.R
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ fetch_args_list <- function(
#' @return
#' - For `fetch`: a tibble or a JSON-like list
#' @export
#' @include cache.R
#'
fetch <- function(epidata_call, fetch_args = fetch_args_list()) {
stopifnot(inherits(epidata_call, "epidata_call"))
Expand All @@ -203,11 +204,7 @@ fetch <- function(epidata_call, fetch_args = fetch_args_list()) {
return(fetch_debug(epidata_call, fetch_args))
}

if (epidata_call$only_supports_classic) {
return(fetch_classic(epidata_call, fetch_args))
} else {
return(fetch_tbl(epidata_call, fetch_args))
}
cache_epidata_call(epidata_call, fetch_args = fetch_args)
}

#' Fetches the data and returns a tibble
Expand Down
Loading