Skip to content

Commit 1647ea2

Browse files
authored
Merge pull request #1819 from cmu-delphi/ndefries/backfill/speed3
[Backfill corrections] Use faster deduplicate fn for QC
2 parents 2f92ca7 + ef60502 commit 1647ea2

File tree

3 files changed

+15
-13
lines changed

3 files changed

+15
-13
lines changed

backfill_corrections/delphiBackfillCorrection/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ importFrom(dplyr,arrange)
3030
importFrom(dplyr,bind_cols)
3131
importFrom(dplyr,bind_rows)
3232
importFrom(dplyr,desc)
33+
importFrom(dplyr,distinct)
3334
importFrom(dplyr,everything)
3435
importFrom(dplyr,filter)
3536
importFrom(dplyr,full_join)

backfill_corrections/delphiBackfillCorrection/R/main.R

Lines changed: 5 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,6 @@
1818
run_backfill <- function(df, params,
1919
refd_col = "time_value", lag_col = "lag", issued_col = "issue_date",
2020
signal_suffixes = c(""), indicator = "", signal = "") {
21-
df <- filter(df, lag < params$ref_lag + 30) # a rough filtration to save memory
22-
2321
geo_levels <- params$geo_levels
2422
if ("state" %in% geo_levels) {
2523
# If state included, do it last since state processing modifies the
@@ -317,14 +315,12 @@ main <- function(params,
317315

318316
msg_ts("Reading in and combining associated files")
319317
input_data <- lapply(
320-
files_list,
321-
function(file) {
322-
# refd_col and issued_col read in as strings
323-
read_data(file) %>%
324-
fips_to_geovalue()
325-
}
318+
files_list, read_data # refd_col and issued_col read in as strings
326319
) %>%
327-
bind_rows()
320+
bind_rows() %>%
321+
fips_to_geovalue() %>%
322+
# a rough filter to save memory
323+
filter(lag < params$ref_lag + 30)
328324

329325
if (nrow(input_data) == 0) {
330326
warning("No data available for indicator ", input_group$indicator,

backfill_corrections/delphiBackfillCorrection/R/utils.R

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -169,6 +169,8 @@ create_dir_not_exist <- function(path)
169169
#' @return list of input dataframe augmented with lag column, if it
170170
#' didn't already exist, and character vector of one or two value
171171
#' column names, depending on requested `value_types`
172+
#'
173+
#' @importFrom dplyr distinct across
172174
validity_checks <- function(df, value_types, num_col, denom_col, signal_suffixes,
173175
refd_col = "time_value", lag_col = "lag", issued_col = "issue_date") {
174176
if (!missing(signal_suffixes) && !is.na(signal_suffixes) && !all(signal_suffixes == "") && !all(is.na(signal_suffixes))) {
@@ -205,13 +207,16 @@ validity_checks <- function(df, value_types, num_col, denom_col, signal_suffixes
205207
}
206208

207209
# Drop duplicate rows.
208-
duplicate_i <- duplicated(df)
209-
if (any(duplicate_i)) {
210+
raw_df_rows <- nrow(df)
211+
df <- distinct(df)
212+
new_df_rows <- nrow(df)
213+
if (raw_df_rows != new_df_rows) {
210214
warning("Data contains duplicate rows, dropping")
211-
df <- df[!duplicate_i,]
212215
}
213216

214-
if (anyDuplicated(df[, c(refd_col, issued_col, "geo_value", "state_id")])) {
217+
if (new_df_rows != nrow(
218+
distinct(df, across(c(refd_col, issued_col, "geo_value", "state_id")))
219+
)) {
215220
stop("Data contains multiple entries with differing values for at",
216221
" least one reference date-issue date-location combination")
217222
}

0 commit comments

Comments
 (0)