Skip to content

Commit 1e45282

Browse files
authored
Merge pull request #153 from cmu-delphi/dsweber2/caching
add caching
2 parents ef674ea + e6ca8f2 commit 1e45282

16 files changed

+810
-5
lines changed

DESCRIPTION

+18
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,18 @@ Encoding: UTF-8
2424
LazyData: true
2525
Roxygen: list(markdown = TRUE)
2626
Imports:
27+
cachem,
2728
checkmate,
2829
cli,
2930
httr,
31+
glue,
3032
jsonlite,
3133
magrittr,
3234
MMWRweek,
3335
purrr,
36+
openssl,
3437
readr,
38+
rlang,
3539
tibble,
3640
xml2
3741
RoxygenNote: 7.2.3
@@ -45,3 +49,17 @@ Suggests:
4549
VignetteBuilder: knitr
4650
Language: en-US
4751
Config/testthat/edition: 3
52+
Collate:
53+
'auth.R'
54+
'avail_endpoints.R'
55+
'cache.R'
56+
'check.R'
57+
'constants.R'
58+
'covidcast.R'
59+
'endpoints.R'
60+
'epidatacall.R'
61+
'epidatr-package.R'
62+
'model.R'
63+
'request.R'
64+
'utils-pipe.R'
65+
'utils.R'

NAMESPACE

+8
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ S3method(print,covidcast_epidata)
88
S3method(print,epidata_call)
99
export("%>%")
1010
export(avail_endpoints)
11+
export(cache_info)
12+
export(clear_cache)
1113
export(covid_hosp_facility)
1214
export(covid_hosp_facility_lookup)
1315
export(covid_hosp_state_timeseries)
@@ -16,6 +18,7 @@ export(covidcast_epidata)
1618
export(covidcast_meta)
1719
export(delphi)
1820
export(dengue_nowcast)
21+
export(disable_cache)
1922
export(ecdc_ili)
2023
export(epirange)
2124
export(fetch)
@@ -40,7 +43,11 @@ export(pvt_norostat)
4043
export(pvt_quidel)
4144
export(pvt_sensors)
4245
export(pvt_twitter)
46+
export(set_cache)
4347
export(wiki)
48+
import(cachem)
49+
import(glue)
50+
import(openssl)
4451
importFrom(MMWRweek,MMWRweek2Date)
4552
importFrom(checkmate,assert)
4653
importFrom(checkmate,assert_character)
@@ -72,6 +79,7 @@ importFrom(readr,read_csv)
7279
importFrom(tibble,as_tibble)
7380
importFrom(tibble,tibble)
7481
importFrom(utils,help.search)
82+
importFrom(utils,sessionInfo)
7583
importFrom(xml2,read_html)
7684
importFrom(xml2,xml_find_all)
7785
importFrom(xml2,xml_text)

R/cache.R

+298
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,298 @@
1+
# IMPORTANT DEV NOTE: make sure to @include cache.R in the Roxygen docs of any
2+
# function referencing this environment, so this file is loaded first
3+
cache_environ <- new.env(parent = emptyenv())
4+
cache_environ$use_cache <- NULL
5+
cache_environ$epidatr_cache <- NULL
6+
#' create or renew a cache for this session
7+
#' @aliases set_cache
8+
#' @description
9+
#' By default, epidatr re-requests data from the API on every call of `fetch`.
10+
#' In case you find yourself repeatedly calling the same data, you can enable
11+
#' the cache using either this function for a given session, or environmental
12+
#' variables for a persistent cache.
13+
#' The typical recommended workflow for using the cache is to set the
14+
#' environmental variables `EPIDATR_USE_CACHE=TRUE` and
15+
#' `EPIDATR_CACHE_DIRECTORY="/your/directory/here"`in your `.Renviron`, for
16+
#' example by calling `usethis::edit_r_environ()`.
17+
#' See the parameters below for some more configurables if you're so inclined.
18+
#'
19+
#' `set_cache` (re)defines the cache to use in a particular R session. This does
20+
#' not clear existing data at any previous location, but instead creates a
21+
#' handle to the new cache using [cachem](https://cachem.r-lib.org/index.html)
22+
#' that seamlessly handles caching for you.
23+
#' Say your cache is normally stored in some default directory, but for the
24+
#' current session you want to save your results in
25+
#' `~/my/temporary/savedirectory`, then you would call `set_cache(dir =
26+
#' "~/my/temporary/savedirectory")`.
27+
#' Or if you know the data from 2 days ago is wrong, you could call
28+
#' `set_cache(days = 1)` to clear older data whenever the cache is referenced.
29+
#' In both cases, these changes would only last for a single session (though the
30+
#' deleted data would be gone permanently!).
31+
#'
32+
#' An important feature of the caching in this package is that only calls which
33+
#' specify either `issues` before a certain date, or `as_of` before a certain
34+
#' date will actually cache. For example the call
35+
#' ```
36+
#' covidcast(
37+
#' source = "jhu-csse",
38+
#' signals = "confirmed_7dav_incidence_prop",
39+
#' geo_type = "state",
40+
#' time_type = "day",
41+
#' geo_values = "ca,fl",
42+
#' time_values = epirange(20200601, 20230801)
43+
#' )
44+
#' ```
45+
#' *won't* cache, since it is possible for the cache to be invalidated by new
46+
#' releases with no warning. On the other hand, the call
47+
#' ```
48+
#' covidcast(
49+
#' source = "jhu-csse",
50+
#' signals = "confirmed_7dav_incidence_prop",
51+
#' geo_type = "state",
52+
#' time_type = "day",
53+
#' geo_values = "ca,fl",
54+
#' time_values = epirange(20200601, 20230801),
55+
#' as_of = "2023-08-01"
56+
#' )
57+
#' ```
58+
#' *will* cache, since normal new versions of data can't invalidate it (since
59+
#' they would be `as_of` a later date). It is still possible that Delphi may
60+
#' patch such data, but the frequency is on the order of months rather than
61+
#' days. We are working on creating a public channel to communicate such
62+
#' updates. While specifying `issues` will usually cache, a call with
63+
#' `issues="*"` won't cache, since its subject to cache invalidation by normal
64+
#' versioning.
65+
#'
66+
#' On the backend, the cache uses cachem, with filenames generated using an md5
67+
#' encoding of the call url. Each file corresponds to a unique epidata-API
68+
#' call.
69+
#' @examples
70+
#' \dontrun{
71+
#' set_cache(
72+
#' dir = "some/subdir",
73+
#' days = 14,
74+
#' max_size = 512,
75+
#' logfile = "some/subdir/logs.txt",
76+
#' prune_rate = 20L
77+
#' )
78+
#' }
79+
#'
80+
#' @param cache_dir the directory in which the cache is stored. By default, this
81+
#' is `tools::R_user_dir()` if on R 4.0+, but must be specified for earlier
82+
#' versions of R. The path can be either relative or absolute. The
83+
#' environmental variable is `EPIDATR_CACHE_DIR`.
84+
#' @param days the maximum length of time in days to keep any particular cached
85+
#' call. By default this is `1`. The environmental variable is
86+
#' `EPIDATR_CACHE_MAX_AGE_DAYS`.
87+
#' @param max_size the size of the entire cache, in MB, at which to start
88+
#' pruning entries. By default this is `1024`, or 1GB. The environmental
89+
#' variable is `EPIDATR_CACHE_MAX_SIZE_MB`.
90+
#' @param logfile where cachem's log of transactions is stored, relative to the
91+
#' cache directory. By default, it is `"logfile.txt"`. The environmental
92+
#' variable is `EPIDATR_CACHE_LOGFILE`.
93+
#' @param prune_rate how many calls to go between checking if any cache elements
94+
#' are too old or if the cache overall is too large. Defaults to `2000L`.
95+
#' Since cachem fixes the max time between prune checks to 5 seconds, there's
96+
#' little reason to actually change this parameter. Doesn't have a
97+
#' corresponding environmental variable.
98+
#' @param confirm whether to confirm directory creation. default is `TRUE`;
99+
#' should only be set in non-interactive scripts
100+
#' @seealso [clear_cache] to delete the old cache while making a new one,
101+
#' [disable_cache] to disable without deleting, and [cache_info]
102+
#' @export
103+
#' @import cachem
104+
#' @import glue
105+
#' @importFrom utils sessionInfo
106+
set_cache <- function(cache_dir = NULL,
107+
days = NULL,
108+
max_size = NULL,
109+
logfile = NULL,
110+
prune_rate = 2000L,
111+
confirm = TRUE) {
112+
if (is.null(cache_dir) && sessionInfo()$R.version$major >= 4) {
113+
cache_dir <- Sys.getenv("EPIDATR_CACHE_DIR", unset = tools::R_user_dir("epidatr"))
114+
} else if (is.null(cache_dir)) {
115+
# earlier version, so no tools
116+
cache_dir <- Sys.getenv("EPIDATR_CACHE_DIR")
117+
if (cache_dir == "") {
118+
cli::cli_abort("no valid EPIDATR_CACHE_DIR", class = "epidatr_cache_error")
119+
}
120+
}
121+
stopifnot(is.character(cache_dir))
122+
if (is.null(days)) {
123+
days <- Sys.getenv("EPIDATR_CACHE_MAX_AGE_DAYS", unset = 1) %>% as.numeric()
124+
}
125+
if (is.null(max_size)) {
126+
max_size <- Sys.getenv("EPIDATR_CACHE_MAX_SIZE_MB", unset = 1024) %>% as.numeric()
127+
}
128+
if (is.null(logfile)) {
129+
logfile <- Sys.getenv("EPIDATR_CACHE_LOGFILE", unset = "logfile.txt")
130+
}
131+
stopifnot(is.character(logfile))
132+
stopifnot(is.numeric(days), is.numeric(max_size), is.integer(prune_rate))
133+
#
134+
# make sure that that directory exists and drag the user into that process
135+
cache_exists <- file.exists(cache_dir)
136+
cache_usable <- file.access(cache_dir, mode = 6) == 0
137+
if (!(cache_exists)) {
138+
if (confirm) {
139+
user_input <- readline(glue::glue(
140+
"there is no directory at {cache_dir}; the cache will be turned off until a ",
141+
"viable directory has been set. Create one? (yes|no(default)) "
142+
))
143+
repeat {
144+
valid_user_input <- ifelse(grepl("yes|no", user_input), sub(".*(yes|no).*", "\\1", user_input), NA)
145+
if (user_input == "") {
146+
valid_user_input <- ""
147+
}
148+
if (!is.na(valid_user_input)) {
149+
break
150+
}
151+
user_input <- readline(glue::glue(" please answer either yes or no: "))
152+
}
153+
} else {
154+
valid_user_input <- "yes"
155+
}
156+
if (valid_user_input == "yes") {
157+
dir.create(cache_dir, showWarnings = TRUE, recursive = TRUE)
158+
cache_exists <- TRUE
159+
cache_usable <- file.access(cache_dir, mode = 6) == 0
160+
}
161+
}
162+
163+
164+
if (!cache_usable) {
165+
print(glue::glue(
166+
"The directory at {cache_dir} is not accessible; check permissions and/or use a different ",
167+
"directory for the cache (see the `set_cache` documentation)."
168+
))
169+
} else if (cache_exists) {
170+
cache_environ$epidatr_cache <- cachem::cache_disk(
171+
dir = cache_dir,
172+
max_size = as.integer(max_size * 1024^2),
173+
max_age = days * 24 * 60 * 60,
174+
logfile = file.path(cache_dir, logfile),
175+
prune_rate = prune_rate
176+
)
177+
}
178+
}
179+
180+
#' manually reset the cache, deleting all currently saved data and starting afresh
181+
#' @description
182+
#' deletes the current cache and resets a new cache. Deletes local data! If you
183+
#' are using a session unique cache, you will have to pass the arguments you
184+
#' used for `set_cache` earlier, otherwise the system-wide `.Renviron`-based
185+
#' defaults will be used.
186+
#' @examples
187+
#' \dontrun{
188+
#' clear_cache(
189+
#' dir = "some/subdir",
190+
#' days = 14,
191+
#' max_size = 512,
192+
#' logfile = "some/subdir/logs.txt",
193+
#' prune_rate = 20L
194+
#' )
195+
#' }
196+
#' @param disable instead of setting a new cache, disable caching entirely;
197+
#' defaults to `FALSE`
198+
#' @param ... see the `set_cache` arguments below
199+
#' @inheritParams set_cache
200+
#' @seealso [set_cache] to start a new cache (and general caching info),
201+
#' [disable_cache] to only disable without deleting, and [cache_info]
202+
#' @export
203+
#' @import cachem
204+
clear_cache <- function(disable = FALSE, ...) {
205+
cache_environ$epidatr_cache$destroy()
206+
if (!disable) {
207+
set_cache(...)
208+
} else {
209+
cache_environ$epidatr_cache <- NULL
210+
}
211+
}
212+
213+
#' turn off the caching for this session
214+
#' @description
215+
#' Disable caching until you call `set_cache` or restart R. The files defining
216+
#' the cache are untouched. If you are looking to disable the caching more
217+
#' permanently, set `EPIDATR_USE_CACHE=FALSE` as environmental variable in
218+
#' your `.Renviron`.
219+
#' @export
220+
#' @seealso [set_cache] to start a new cache (and general caching info),
221+
#' [clear_cache] to delete the cache and set a new one, and [cache_info]
222+
#' @import cachem
223+
disable_cache <- function() {
224+
cache_environ$epidatr_cache <- NULL
225+
}
226+
227+
#' describe current cache
228+
#' @description
229+
#' Print out the information about the cache (as would be returned by cachem's
230+
#' `info()` method)
231+
#' @seealso [set_cache] to start a new cache (and general caching info),
232+
#' [clear_cache] to delete the cache and set a new one, and [disable_cache] to
233+
#' disable without deleting
234+
#' @export
235+
cache_info <- function() {
236+
if (is.null(cache_environ$epidatr_cache)) {
237+
return("there is no cache")
238+
} else {
239+
return(cache_environ$epidatr_cache$info())
240+
}
241+
}
242+
243+
#' dispatch caching
244+
#'
245+
#' @description
246+
#' the guts of caching, its interposed between fetch and the specific fetch
247+
#' methods. Internal method only.
248+
#'
249+
#' @param call the `epidata_call` object
250+
#' @param fetch_args the args list for fetch as generated by [fetch_args_list()]
251+
#' @keywords internal
252+
#' @import cachem openssl
253+
cache_epidata_call <- function(epidata_call, fetch_args = fetch_args_list()) {
254+
is_cachable <- check_is_cachable(epidata_call, fetch_args)
255+
if (is_cachable) {
256+
target <- request_url(epidata_call)
257+
hashed <- md5(target)
258+
cached <- cache_environ$epidatr_cache$get(hashed)
259+
as_of_recent <- check_is_recent(epidata_call$params$as_of, 7)
260+
issues_recent <- check_is_recent(epidata_call$params$issues, 7)
261+
if (as_of_recent || issues_recent) {
262+
cli::cli_warn("using cached results with `as_of` within the past week (or the future!). This will likely result ",
263+
"in an invalid cache. Consider\n",
264+
"1. disabling the cache for this session with `disable_cache` or permanently with environmental ",
265+
"variable `EPIDATR_USE_CACHE=FALSE`\n",
266+
"2. setting `EPIDATR_CACHE_MAX_AGE_DAYS={Sys.getenv('EPIDATR_CACHE_MAX_AGE_DAYS', unset = 1)}` to e.g. `3/24` ",
267+
"(3 hours).",
268+
.frequency = "regularly",
269+
.frequency_id = "cache timing issues",
270+
class = "cache_recent_data"
271+
)
272+
}
273+
if (!is.key_missing(cached)) {
274+
cli::cli_warn(
275+
c(
276+
"loading from the cache at {cache_environ$epidatr_cache$info()$dir}; ",
277+
"see {cache_environ$epidatr_cache$info()$logfile} for more details."
278+
),
279+
.frequency = "regularly",
280+
.frequency_id = "using the cache",
281+
class = "cache_access"
282+
)
283+
return(cached[[1]])
284+
}
285+
}
286+
'which was saved on {format(cached[[2]],"%A %B %d, %Y")}, which took {round(cached[[3]][[3]], digits=5)} seconds.'
287+
# need to actually get the data, since its either not in the cache or we're not caching
288+
runtime <- system.time(if (epidata_call$only_supports_classic) {
289+
fetched <- fetch_classic(epidata_call, fetch_args)
290+
} else {
291+
fetched <- fetch_tbl(epidata_call, fetch_args)
292+
})
293+
# add it to the cache if appropriate
294+
if (is_cachable) {
295+
cache_environ$epidatr_cache$set(hashed, list(fetched, Sys.time(), runtime))
296+
}
297+
return(fetched)
298+
}

R/epidatacall.R

+2-5
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ fetch_args_list <- function(
186186
#' @return
187187
#' - For `fetch`: a tibble or a JSON-like list
188188
#' @export
189+
#' @include cache.R
189190
#'
190191
fetch <- function(epidata_call, fetch_args = fetch_args_list()) {
191192
stopifnot(inherits(epidata_call, "epidata_call"))
@@ -203,11 +204,7 @@ fetch <- function(epidata_call, fetch_args = fetch_args_list()) {
203204
return(fetch_debug(epidata_call, fetch_args))
204205
}
205206

206-
if (epidata_call$only_supports_classic) {
207-
return(fetch_classic(epidata_call, fetch_args))
208-
} else {
209-
return(fetch_tbl(epidata_call, fetch_args))
210-
}
207+
cache_epidata_call(epidata_call, fetch_args = fetch_args)
211208
}
212209

213210
#' Fetches the data and returns a tibble

0 commit comments

Comments
 (0)