Skip to content

Step through pre-calculated start times for each group using closure rather than using .real col in epi_slide #397

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: epiprocess
Title: Tools for basic signal processing in epidemiology
Version: 0.7.1.9999
Version: 0.7.2.9999
Authors@R: c(
person("Jacob", "Bien", role = "ctb"),
person("Logan", "Brooks", role = "aut"),
Expand Down
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# epiprocess 0.7.2.9999

## Improvements

* `epi_slide` computations are now 2-4 times faster after changing how
reference time values, made accessible within sliding functions, are
calculated (#397).

# epiprocess 0.7.1.9999

Note that `epiprocess` uses the [Semantic Versioning
Expand Down
96 changes: 32 additions & 64 deletions R/slide.R
Original file line number Diff line number Diff line change
Expand Up @@ -230,84 +230,56 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
after <- time_step(after)
}

min_ref_time_values <- ref_time_values - before
min_ref_time_values_not_in_x <- min_ref_time_values[!(min_ref_time_values %in% unique(x$time_value))]

# Do set up to let us recover `ref_time_value`s later.
# A helper column marking real observations.
x$.real <- TRUE

# Create df containing phony data. Df has the same columns and attributes as
# `x`, but filled with `NA`s aside from grouping columns. Number of rows is
# equal to the number of `min_ref_time_values_not_in_x` we have * the
# number of unique levels seen in the grouping columns.
before_time_values_df <- data.frame(time_value = min_ref_time_values_not_in_x)
if (length(group_vars(x)) != 0) {
before_time_values_df <- dplyr::cross_join(
# Get unique combinations of grouping columns seen in real data.
unique(x[, group_vars(x)]),
before_time_values_df
)
}
# Automatically fill in all other columns from `x` with `NA`s, and carry
# attributes over to new df.
before_time_values_df <- bind_rows(x[0, ], before_time_values_df)
before_time_values_df$.real <- FALSE

x <- bind_rows(before_time_values_df, x)

# Arrange by increasing time_value
x <- arrange(x, time_value)

# Now set up starts and stops for sliding/hopping
time_range <- range(unique(x$time_value))
starts <- in_range(ref_time_values - before, time_range)
stops <- in_range(ref_time_values + after, time_range)

if (length(starts) == 0 || length(stops) == 0) {
Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values and align (and before, if specified).")
}
starts <- ref_time_values - before
stops <- ref_time_values + after

# Symbolize new column name
new_col <- sym(new_col_name)

# Computation for one group, all time values
slide_one_grp <- function(.data_group,
f, ...,
.group_key, # see `?group_modify`
..., # `...` to `epi_slide` forwarded here
f_factory,
starts,
stops,
time_values,
ref_time_values,
all_rows,
new_col) {
# Figure out which reference time values appear in the data group in the
# first place (we need to do this because it could differ based on the
# group, hence the setup/checks for the reference time values based on all
# the data could still be off)
o <- time_values %in% .data_group$time_value
# the data could still be off):
o <- ref_time_values %in% .data_group$time_value
starts <- starts[o]
stops <- stops[o]
time_values <- time_values[o]
kept_ref_time_values <- ref_time_values[o]

f <- f_factory(kept_ref_time_values)

# Compute the slide values
slide_values_list <- slider::hop_index(
.x = .data_group,
.i = .data_group$time_value,
.f = f, ...,
.starts = starts,
.stops = stops
.stops = stops,
.f = f,
.group_key, ...
)

# Now figure out which rows in the data group are in the reference time
# values; this will be useful for all sorts of checks that follow
o <- .data_group$time_value %in% time_values
o <- .data_group$time_value %in% kept_ref_time_values
num_ref_rows <- sum(o)

# Count the number of appearances of each reference time value (these
# appearances should all be real for now, but if we allow ref time values
# outside of .data_group's time values):
counts <- dplyr::filter(.data_group, .data$time_value %in% time_values) %>%
# Count the number of appearances of each kept reference time value.
counts <- dplyr::filter(.data_group, .data$time_value %in% kept_ref_time_values) %>%
dplyr::count(.data$time_value) %>%
dplyr::pull(n)
`[[`("n")

if (!all(purrr::map_lgl(slide_values_list, is.atomic)) &&
!all(purrr::map_lgl(slide_values_list, is.data.frame))) {
Expand Down Expand Up @@ -349,7 +321,6 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
# fills with NA equivalent.
vctrs::vec_slice(slide_values, o) <- orig_values
} else {
# This implicitly removes phony (`.real` == FALSE) observations.
.data_group <- filter(.data_group, o)
}
return(mutate(.data_group, !!new_col := slide_values))
Expand All @@ -372,18 +343,24 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,

f <- as_slide_computation(f, ...)
# Create a wrapper that calculates and passes `.ref_time_value` to the
# computation.
f_wrapper <- function(.x, .group_key, ...) {
.ref_time_value <- min(.x$time_value) + before
.x <- .x[.x$.real, ]
.x$.real <- NULL
f(.x, .group_key, .ref_time_value, ...)
# computation. `i` is contained in the `f_wrapper_factory` environment such
# that when called within `slide_one_grp` `i` is reset for every group.
f_wrapper_factory <- function(kept_ref_time_values) {
# Use `i` to advance through list of start dates.
i <- 1L
f_wrapper <- function(.x, .group_key, ...) {
.ref_time_value <- kept_ref_time_values[[i]]
i <<- i + 1L
f(.x, .group_key, .ref_time_value, ...)
}
return(f_wrapper)
}
x <- group_modify(x, slide_one_grp,
f = f_wrapper, ...,
...,
f_factory = f_wrapper_factory,
starts = starts,
stops = stops,
time_values = ref_time_values,
ref_time_values = ref_time_values,
all_rows = all_rows,
new_col = new_col,
.keep = FALSE
Expand All @@ -394,14 +371,5 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
x <- unnest(x, !!new_col, names_sep = names_sep)
}

# Remove any remaining phony observations. When `all_rows` is TRUE, phony
# observations aren't necessarily removed in `slide_one_grp`.
if (all_rows) {
x <- x[x$.real, ]
}

# Drop helper column `.real`.
x$.real <- NULL

return(x)
}
4 changes: 0 additions & 4 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -361,10 +361,6 @@ as_slide_computation <- function(f, ...) {

##########

in_range <- function(x, rng) pmin(pmax(x, rng[1]), rng[2])

##########

Min <- function(x) min(x, na.rm = TRUE)
Max <- function(x) max(x, na.rm = TRUE)
Sum <- function(x) sum(x, na.rm = TRUE)
Expand Down
56 changes: 34 additions & 22 deletions man/as_epi_archive.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 26 additions & 18 deletions man/as_epi_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading