|
| 1 | +# IMPORTANT DEV NOTE: make sure to @include cache.R in the Roxygen docs of any |
| 2 | +# function referencing this environment, so this file is loaded first |
| 3 | +cache_environ <- new.env(parent = emptyenv()) |
| 4 | +cache_environ$use_cache <- NULL |
| 5 | +cache_environ$epidatr_cache <- NULL |
| 6 | +#' create or renew a cache for this session |
| 7 | +#' @aliases set_cache |
| 8 | +#' @description |
| 9 | +#' By default, epidatr re-requests data from the API on every call of `fetch`. |
| 10 | +#' In case you find yourself repeatedly calling the same data, you can enable |
| 11 | +#' the cache using either this function for a given session, or environmental |
| 12 | +#' variables for a persistent cache. |
| 13 | +#' The typical recommended workflow for using the cache is to set the |
| 14 | +#' environmental variables `EPIDATR_USE_CACHE=TRUE` and |
| 15 | +#' `EPIDATR_CACHE_DIRECTORY="/your/directory/here"`in your `.Renviron`, for |
| 16 | +#' example by calling `usethis::edit_r_environ()`. |
| 17 | +#' See the parameters below for some more configurables if you're so inclined. |
| 18 | +#' |
| 19 | +#' `set_cache` (re)defines the cache to use in a particular R session. This does |
| 20 | +#' not clear existing data at any previous location, but instead creates a |
| 21 | +#' handle to the new cache using [cachem](https://cachem.r-lib.org/index.html) |
| 22 | +#' that seamlessly handles caching for you. |
| 23 | +#' Say your cache is normally stored in some default directory, but for the |
| 24 | +#' current session you want to save your results in |
| 25 | +#' `~/my/temporary/savedirectory`, then you would call `set_cache(dir = |
| 26 | +#' "~/my/temporary/savedirectory")`. |
| 27 | +#' Or if you know the data from 2 days ago is wrong, you could call |
| 28 | +#' `set_cache(days = 1)` to clear older data whenever the cache is referenced. |
| 29 | +#' In both cases, these changes would only last for a single session (though the |
| 30 | +#' deleted data would be gone permanently!). |
| 31 | +#' |
| 32 | +#' An important feature of the caching in this package is that only calls which |
| 33 | +#' specify either `issues` before a certain date, or `as_of` before a certain |
| 34 | +#' date will actually cache. For example the call |
| 35 | +#' ``` |
| 36 | +#' covidcast( |
| 37 | +#' source = "jhu-csse", |
| 38 | +#' signals = "confirmed_7dav_incidence_prop", |
| 39 | +#' geo_type = "state", |
| 40 | +#' time_type = "day", |
| 41 | +#' geo_values = "ca,fl", |
| 42 | +#' time_values = epirange(20200601, 20230801) |
| 43 | +#' ) |
| 44 | +#' ``` |
| 45 | +#' *won't* cache, since it is possible for the cache to be invalidated by new |
| 46 | +#' releases with no warning. On the other hand, the call |
| 47 | +#' ``` |
| 48 | +#' covidcast( |
| 49 | +#' source = "jhu-csse", |
| 50 | +#' signals = "confirmed_7dav_incidence_prop", |
| 51 | +#' geo_type = "state", |
| 52 | +#' time_type = "day", |
| 53 | +#' geo_values = "ca,fl", |
| 54 | +#' time_values = epirange(20200601, 20230801), |
| 55 | +#' as_of = "2023-08-01" |
| 56 | +#' ) |
| 57 | +#' ``` |
| 58 | +#' *will* cache, since normal new versions of data can't invalidate it (since |
| 59 | +#' they would be `as_of` a later date). It is still possible that Delphi may |
| 60 | +#' patch such data, but the frequency is on the order of months rather than |
| 61 | +#' days. We are working on creating a public channel to communicate such |
| 62 | +#' updates. While specifying `issues` will usually cache, a call with |
| 63 | +#' `issues="*"` won't cache, since its subject to cache invalidation by normal |
| 64 | +#' versioning. |
| 65 | +#' |
| 66 | +#' On the backend, the cache uses cachem, with filenames generated using an md5 |
| 67 | +#' encoding of the call url. Each file corresponds to a unique epidata-API |
| 68 | +#' call. |
| 69 | +#' @examples |
| 70 | +#' \dontrun{ |
| 71 | +#' set_cache( |
| 72 | +#' dir = "some/subdir", |
| 73 | +#' days = 14, |
| 74 | +#' max_size = 512, |
| 75 | +#' logfile = "some/subdir/logs.txt", |
| 76 | +#' prune_rate = 20L |
| 77 | +#' ) |
| 78 | +#' } |
| 79 | +#' |
| 80 | +#' @param cache_dir the directory in which the cache is stored. By default, this |
| 81 | +#' is `tools::R_user_dir()` if on R 4.0+, but must be specified for earlier |
| 82 | +#' versions of R. The path can be either relative or absolute. The |
| 83 | +#' environmental variable is `EPIDATR_CACHE_DIR`. |
| 84 | +#' @param days the maximum length of time in days to keep any particular cached |
| 85 | +#' call. By default this is `1`. The environmental variable is |
| 86 | +#' `EPIDATR_CACHE_MAX_AGE_DAYS`. |
| 87 | +#' @param max_size the size of the entire cache, in MB, at which to start |
| 88 | +#' pruning entries. By default this is `1024`, or 1GB. The environmental |
| 89 | +#' variable is `EPIDATR_CACHE_MAX_SIZE_MB`. |
| 90 | +#' @param logfile where cachem's log of transactions is stored, relative to the |
| 91 | +#' cache directory. By default, it is `"logfile.txt"`. The environmental |
| 92 | +#' variable is `EPIDATR_CACHE_LOGFILE`. |
| 93 | +#' @param prune_rate how many calls to go between checking if any cache elements |
| 94 | +#' are too old or if the cache overall is too large. Defaults to `2000L`. |
| 95 | +#' Since cachem fixes the max time between prune checks to 5 seconds, there's |
| 96 | +#' little reason to actually change this parameter. Doesn't have a |
| 97 | +#' corresponding environmental variable. |
| 98 | +#' @param confirm whether to confirm directory creation. default is `TRUE`; |
| 99 | +#' should only be set in non-interactive scripts |
| 100 | +#' @seealso [clear_cache] to delete the old cache while making a new one, |
| 101 | +#' [disable_cache] to disable without deleting, and [cache_info] |
| 102 | +#' @export |
| 103 | +#' @import cachem |
| 104 | +#' @import glue |
| 105 | +#' @importFrom utils sessionInfo |
| 106 | +set_cache <- function(cache_dir = NULL, |
| 107 | + days = NULL, |
| 108 | + max_size = NULL, |
| 109 | + logfile = NULL, |
| 110 | + prune_rate = 2000L, |
| 111 | + confirm = TRUE) { |
| 112 | + if (is.null(cache_dir) && sessionInfo()$R.version$major >= 4) { |
| 113 | + cache_dir <- Sys.getenv("EPIDATR_CACHE_DIR", unset = tools::R_user_dir("epidatr")) |
| 114 | + } else if (is.null(cache_dir)) { |
| 115 | + # earlier version, so no tools |
| 116 | + cache_dir <- Sys.getenv("EPIDATR_CACHE_DIR") |
| 117 | + if (cache_dir == "") { |
| 118 | + cli::cli_abort("no valid EPIDATR_CACHE_DIR", class = "epidatr_cache_error") |
| 119 | + } |
| 120 | + } |
| 121 | + stopifnot(is.character(cache_dir)) |
| 122 | + if (is.null(days)) { |
| 123 | + days <- Sys.getenv("EPIDATR_CACHE_MAX_AGE_DAYS", unset = 1) %>% as.numeric() |
| 124 | + } |
| 125 | + if (is.null(max_size)) { |
| 126 | + max_size <- Sys.getenv("EPIDATR_CACHE_MAX_SIZE_MB", unset = 1024) %>% as.numeric() |
| 127 | + } |
| 128 | + if (is.null(logfile)) { |
| 129 | + logfile <- Sys.getenv("EPIDATR_CACHE_LOGFILE", unset = "logfile.txt") |
| 130 | + } |
| 131 | + stopifnot(is.character(logfile)) |
| 132 | + stopifnot(is.numeric(days), is.numeric(max_size), is.integer(prune_rate)) |
| 133 | + # |
| 134 | + # make sure that that directory exists and drag the user into that process |
| 135 | + cache_exists <- file.exists(cache_dir) |
| 136 | + cache_usable <- file.access(cache_dir, mode = 6) == 0 |
| 137 | + if (!(cache_exists)) { |
| 138 | + if (confirm) { |
| 139 | + user_input <- readline(glue::glue( |
| 140 | + "there is no directory at {cache_dir}; the cache will be turned off until a ", |
| 141 | + "viable directory has been set. Create one? (yes|no(default)) " |
| 142 | + )) |
| 143 | + repeat { |
| 144 | + valid_user_input <- ifelse(grepl("yes|no", user_input), sub(".*(yes|no).*", "\\1", user_input), NA) |
| 145 | + if (user_input == "") { |
| 146 | + valid_user_input <- "" |
| 147 | + } |
| 148 | + if (!is.na(valid_user_input)) { |
| 149 | + break |
| 150 | + } |
| 151 | + user_input <- readline(glue::glue(" please answer either yes or no: ")) |
| 152 | + } |
| 153 | + } else { |
| 154 | + valid_user_input <- "yes" |
| 155 | + } |
| 156 | + if (valid_user_input == "yes") { |
| 157 | + dir.create(cache_dir, showWarnings = TRUE, recursive = TRUE) |
| 158 | + cache_exists <- TRUE |
| 159 | + cache_usable <- file.access(cache_dir, mode = 6) == 0 |
| 160 | + } |
| 161 | + } |
| 162 | + |
| 163 | + |
| 164 | + if (!cache_usable) { |
| 165 | + print(glue::glue( |
| 166 | + "The directory at {cache_dir} is not accessible; check permissions and/or use a different ", |
| 167 | + "directory for the cache (see the `set_cache` documentation)." |
| 168 | + )) |
| 169 | + } else if (cache_exists) { |
| 170 | + cache_environ$epidatr_cache <- cachem::cache_disk( |
| 171 | + dir = cache_dir, |
| 172 | + max_size = as.integer(max_size * 1024^2), |
| 173 | + max_age = days * 24 * 60 * 60, |
| 174 | + logfile = file.path(cache_dir, logfile), |
| 175 | + prune_rate = prune_rate |
| 176 | + ) |
| 177 | + } |
| 178 | +} |
| 179 | + |
| 180 | +#' manually reset the cache, deleting all currently saved data and starting afresh |
| 181 | +#' @description |
| 182 | +#' deletes the current cache and resets a new cache. Deletes local data! If you |
| 183 | +#' are using a session unique cache, you will have to pass the arguments you |
| 184 | +#' used for `set_cache` earlier, otherwise the system-wide `.Renviron`-based |
| 185 | +#' defaults will be used. |
| 186 | +#' @examples |
| 187 | +#' \dontrun{ |
| 188 | +#' clear_cache( |
| 189 | +#' dir = "some/subdir", |
| 190 | +#' days = 14, |
| 191 | +#' max_size = 512, |
| 192 | +#' logfile = "some/subdir/logs.txt", |
| 193 | +#' prune_rate = 20L |
| 194 | +#' ) |
| 195 | +#' } |
| 196 | +#' @param disable instead of setting a new cache, disable caching entirely; |
| 197 | +#' defaults to `FALSE` |
| 198 | +#' @param ... see the `set_cache` arguments below |
| 199 | +#' @inheritParams set_cache |
| 200 | +#' @seealso [set_cache] to start a new cache (and general caching info), |
| 201 | +#' [disable_cache] to only disable without deleting, and [cache_info] |
| 202 | +#' @export |
| 203 | +#' @import cachem |
| 204 | +clear_cache <- function(disable = FALSE, ...) { |
| 205 | + cache_environ$epidatr_cache$destroy() |
| 206 | + if (!disable) { |
| 207 | + set_cache(...) |
| 208 | + } else { |
| 209 | + cache_environ$epidatr_cache <- NULL |
| 210 | + } |
| 211 | +} |
| 212 | + |
| 213 | +#' turn off the caching for this session |
| 214 | +#' @description |
| 215 | +#' Disable caching until you call `set_cache` or restart R. The files defining |
| 216 | +#' the cache are untouched. If you are looking to disable the caching more |
| 217 | +#' permanently, set `EPIDATR_USE_CACHE=FALSE` as environmental variable in |
| 218 | +#' your `.Renviron`. |
| 219 | +#' @export |
| 220 | +#' @seealso [set_cache] to start a new cache (and general caching info), |
| 221 | +#' [clear_cache] to delete the cache and set a new one, and [cache_info] |
| 222 | +#' @import cachem |
| 223 | +disable_cache <- function() { |
| 224 | + cache_environ$epidatr_cache <- NULL |
| 225 | +} |
| 226 | + |
| 227 | +#' describe current cache |
| 228 | +#' @description |
| 229 | +#' Print out the information about the cache (as would be returned by cachem's |
| 230 | +#' `info()` method) |
| 231 | +#' @seealso [set_cache] to start a new cache (and general caching info), |
| 232 | +#' [clear_cache] to delete the cache and set a new one, and [disable_cache] to |
| 233 | +#' disable without deleting |
| 234 | +#' @export |
| 235 | +cache_info <- function() { |
| 236 | + if (is.null(cache_environ$epidatr_cache)) { |
| 237 | + return("there is no cache") |
| 238 | + } else { |
| 239 | + return(cache_environ$epidatr_cache$info()) |
| 240 | + } |
| 241 | +} |
| 242 | + |
| 243 | +#' dispatch caching |
| 244 | +#' |
| 245 | +#' @description |
| 246 | +#' the guts of caching, its interposed between fetch and the specific fetch |
| 247 | +#' methods. Internal method only. |
| 248 | +#' |
| 249 | +#' @param call the `epidata_call` object |
| 250 | +#' @param fetch_args the args list for fetch as generated by [fetch_args_list()] |
| 251 | +#' @keywords internal |
| 252 | +#' @import cachem openssl |
| 253 | +cache_epidata_call <- function(epidata_call, fetch_args = fetch_args_list()) { |
| 254 | + is_cachable <- check_is_cachable(epidata_call, fetch_args) |
| 255 | + if (is_cachable) { |
| 256 | + target <- request_url(epidata_call) |
| 257 | + hashed <- md5(target) |
| 258 | + cached <- cache_environ$epidatr_cache$get(hashed) |
| 259 | + as_of_recent <- check_is_recent(epidata_call$params$as_of, 7) |
| 260 | + issues_recent <- check_is_recent(epidata_call$params$issues, 7) |
| 261 | + if (as_of_recent || issues_recent) { |
| 262 | + cli::cli_warn("using cached results with `as_of` within the past week (or the future!). This will likely result ", |
| 263 | + "in an invalid cache. Consider\n", |
| 264 | + "1. disabling the cache for this session with `disable_cache` or permanently with environmental ", |
| 265 | + "variable `EPIDATR_USE_CACHE=FALSE`\n", |
| 266 | + "2. setting `EPIDATR_CACHE_MAX_AGE_DAYS={Sys.getenv('EPIDATR_CACHE_MAX_AGE_DAYS', unset = 1)}` to e.g. `3/24` ", |
| 267 | + "(3 hours).", |
| 268 | + .frequency = "regularly", |
| 269 | + .frequency_id = "cache timing issues", |
| 270 | + class = "cache_recent_data" |
| 271 | + ) |
| 272 | + } |
| 273 | + if (!is.key_missing(cached)) { |
| 274 | + cli::cli_warn( |
| 275 | + c( |
| 276 | + "loading from the cache at {cache_environ$epidatr_cache$info()$dir}; ", |
| 277 | + "see {cache_environ$epidatr_cache$info()$logfile} for more details." |
| 278 | + ), |
| 279 | + .frequency = "regularly", |
| 280 | + .frequency_id = "using the cache", |
| 281 | + class = "cache_access" |
| 282 | + ) |
| 283 | + return(cached[[1]]) |
| 284 | + } |
| 285 | + } |
| 286 | + 'which was saved on {format(cached[[2]],"%A %B %d, %Y")}, which took {round(cached[[3]][[3]], digits=5)} seconds.' |
| 287 | + # need to actually get the data, since its either not in the cache or we're not caching |
| 288 | + runtime <- system.time(if (epidata_call$only_supports_classic) { |
| 289 | + fetched <- fetch_classic(epidata_call, fetch_args) |
| 290 | + } else { |
| 291 | + fetched <- fetch_tbl(epidata_call, fetch_args) |
| 292 | + }) |
| 293 | + # add it to the cache if appropriate |
| 294 | + if (is_cachable) { |
| 295 | + cache_environ$epidatr_cache$set(hashed, list(fetched, Sys.time(), runtime)) |
| 296 | + } |
| 297 | + return(fetched) |
| 298 | +} |
0 commit comments