-
Notifications
You must be signed in to change notification settings - Fork 8
Speed up duplicate detection in as_epi_df()
#560
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Comments
I ordered selection & filtering in probably a suboptimal way in
My R installation isn't configured for memory profiling, so no memory comparison results. But it should all be in temporary objects. Some smart-copy-on-write might be disabled permanently (and more likely in I'm planning to swap in the
Note: the Note 2: library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(epidatasets)
library(epipredict)
#> Loading required package: epiprocess
#> Registered S3 method overwritten by 'tsibble':
#> method from
#> as_tibble.grouped_df dplyr
#>
#> Attaching package: 'epiprocess'
#> The following object is masked from 'package:stats':
#>
#> filter
#> Loading required package: parsnip
#> Registered S3 method overwritten by 'epipredict':
#> method from
#> print.step_naomit recipes
dup_check1 <- function(x, other_keys) {
duplicated_time_values <- x %>%
group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
filter(dplyr::n() > 1) %>%
ungroup()
nrow(duplicated_time_values) != 0
}
dup_check2 <- function(x, other_keys) {
anyDuplicated(x[c("geo_value", "time_value", other_keys)]) != 0L
}
dup_check3 <- function(x, other_keys) {
if (nrow(x) <= 1L) {
FALSE
} else {
epikeytime_names <- c("geo_value", "time_value", other_keys)
arranged <- arrange(x, across(all_of(epikeytime_names)))
arranged_epikeytimes <- arranged[epikeytime_names]
any(vctrs::vec_equal(arranged_epikeytimes[-1L,], arranged_epikeytimes[-nrow(arranged_epikeytimes),]))
}
}
dup_check4 <- function(x, other_keys) {
if (nrow(x) <= 1L) {
FALSE
} else {
ukey_names <- c("geo_value", "time_value", other_keys)
arranged_ukeys <- arrange(x[ukey_names], across(all_of(ukey_names)))
any(vctrs::vec_equal(arranged_ukeys[-1L,], arranged_ukeys[-nrow(arranged_ukeys),]))
}
}
test_tbl <- as_tibble(covid_case_death_rates_extended)
bench::mark(
dup_check1(test_tbl, character()),
dup_check2(test_tbl, character()),
dup_check3(test_tbl, character()),
dup_check4(test_tbl, character()),
min_time = 3
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dup_check1(test_tbl, character… 293.93ms 298.49ms 3.29 NA 7.57
#> 2 dup_check2(test_tbl, character… 150.56ms 176.29ms 5.06 NA 11.4
#> 3 dup_check3(test_tbl, character… 3.88ms 5.42ms 169. NA 20.3
#> 4 dup_check4(test_tbl, character… 3.78ms 3.92ms 224. NA 22.3
objs <- tibble::lst(
cases_deaths_subset,
counts_subset,
county_smoothed_cli_comparison,
covid_case_death_rates,
covid_case_death_rates_extended,
covid_confirmed_cumulative_num,
covid_incidence_county_subset,
covid_incidence_outliers,
ctis_covid_behaviours,
)
for(nm in names(objs)) {
print(nm)
print(dup_check4(objs[[nm]], character()))
print(bench::mark(
dup_check1(objs[[nm]], character()),
dup_check2(objs[[nm]], character()),
dup_check3(objs[[nm]], character()),
dup_check4(objs[[nm]], character()),
min_time = 3
))
}
#> [1] "cases_deaths_subset"
#> [1] FALSE
#> # A tibble: 4 × 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
#> <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#> 1 dup_check… 35.89ms 37.02ms 26.9 NA 9.72 58 21 2.16s
#> 2 dup_check… 14.33ms 15.67ms 62.9 NA 14.4 144 33 2.29s
#> 3 dup_check… 3.04ms 3.16ms 302. NA 4.49 874 13 2.89s
#> 4 dup_check… 3.01ms 3.13ms 310. NA 4.12 904 12 2.92s
#> # ℹ 4 more variables: result <list>, memory <list>, time <list>, gc <list>
#> [1] "counts_subset"
#> [1] FALSE
#> # A tibble: 4 × 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
#> <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#> 1 dup_check… 14.72ms 15.36ms 64.8 NA 6.64 166 17 2.56s
#> 2 dup_check… 3.74ms 3.98ms 251. NA 12.0 689 33 2.75s
#> 3 dup_check… 2.65ms 2.83ms 351. NA 4.82 1018 14 2.9s
#> 4 dup_check… 2.69ms 2.83ms 347. NA 4.46 1010 13 2.91s
#> # ℹ 4 more variables: result <list>, memory <list>, time <list>, gc <list>
#> [1] "county_smoothed_cli_comparison"
#> [1] FALSE
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 4 × 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#> 1 dup_check1(objs[[n… 527.89ms 531.3ms 1.87 NA 7.47 6 24
#> 2 dup_check2(objs[[n… 256.02ms 305.89ms 2.85 NA 11.4 9 36
#> 3 dup_check3(objs[[n… 8.48ms 9.11ms 93.7 NA 18.3 282 55
#> 4 dup_check4(objs[[n… 8ms 8.63ms 102. NA 13.3 307 40
#> # ℹ 5 more variables: total_time <bch:tm>, result <list>, memory <list>,
#> # time <list>, gc <list>
#> [1] "covid_case_death_rates"
#> [1] FALSE
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 4 × 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#> 1 dup_check1(objs[[n… 159.46ms 163.32ms 6.09 NA 7.69 19 24
#> 2 dup_check2(objs[[n… 83.33ms 98.08ms 9.39 NA 11.3 29 35
#> 3 dup_check3(objs[[n… 4.36ms 5.08ms 185. NA 8.65 556 26
#> 4 dup_check4(objs[[n… 4.17ms 4.63ms 191. NA 8.32 574 25
#> # ℹ 5 more variables: total_time <bch:tm>, result <list>, memory <list>,
#> # time <list>, gc <list>
#> [1] "covid_case_death_rates_extended"
#> [1] FALSE
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 4 × 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#> 1 dup_check1(objs[[n… 285.94ms 302.54ms 3.34 NA 7.89 11 26
#> 2 dup_check2(objs[[n… 153.32ms 189.23ms 4.89 NA 10.4 15 32
#> 3 dup_check3(objs[[n… 5.73ms 5.99ms 142. NA 12.5 443 39
#> 4 dup_check4(objs[[n… 5.59ms 5.78ms 164. NA 11.7 491 35
#> # ℹ 5 more variables: total_time <bch:tm>, result <list>, memory <list>,
#> # time <list>, gc <list>
#> [1] "covid_confirmed_cumulative_num"
#> [1] FALSE
#> # A tibble: 4 × 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
#> <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#> 1 dup_check… 23.94ms 24.87ms 39.9 NA 8.23 97 20 2.43s
#> 2 dup_check… 9.88ms 10.65ms 94.0 NA 14.0 222 33 2.36s
#> 3 dup_check… 2.34ms 2.44ms 394. NA 4.48 1142 13 2.9s
#> 4 dup_check… 2.22ms 2.27ms 426. NA 4.10 1246 12 2.93s
#> # ℹ 4 more variables: result <list>, memory <list>, time <list>, gc <list>
#> [1] "covid_incidence_county_subset"
#> [1] FALSE
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 4 × 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#> 1 dup_check1(objs[[n… 133.29ms 136.23ms 7.35 NA 7.35 23 23
#> 2 dup_check2(objs[[n… 61.42ms 75.97ms 12.1 NA 10.8 37 33
#> 3 dup_check3(objs[[n… 4.14ms 4.41ms 206. NA 7.33 618 22
#> 4 dup_check4(objs[[n… 3.82ms 4.17ms 216. NA 6.99 648 21
#> # ℹ 5 more variables: total_time <bch:tm>, result <list>, memory <list>,
#> # time <list>, gc <list>
#> [1] "covid_incidence_outliers"
#> [1] FALSE
#> # A tibble: 4 × 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
#> <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#> 1 dup_check… 12.38ms 12.78ms 77.4 NA 5.81 213 16 2.75s
#> 2 dup_check… 2.63ms 2.82ms 350. NA 11.8 976 33 2.79s
#> 3 dup_check… 2.71ms 2.8ms 346. NA 4.46 1009 13 2.91s
#> 4 dup_check… 2.66ms 2.79ms 348. NA 4.81 1013 14 2.91s
#> # ℹ 4 more variables: result <list>, memory <list>, time <list>, gc <list>
#> [1] "ctis_covid_behaviours"
#> [1] FALSE
#> # A tibble: 4 × 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
#> <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#> 1 dup_check… 14.75ms 15.39ms 64.5 NA 6.21 166 16 2.58s
#> 2 dup_check… 3.77ms 4.04ms 246. NA 12.0 675 33 2.74s
#> 3 dup_check… 2.75ms 2.85ms 342. NA 4.81 994 14 2.91s
#> 4 dup_check… 2.74ms 2.82ms 347. NA 4.46 1011 13 2.92s
#> # ℹ 4 more variables: result <list>, memory <list>, time <list>, gc <list>
print(dup_check4(grad_employ_subset, attr(grad_employ_subset, "metadata")$other_keys))
#> [1] FALSE
bench::mark(
dup_check1(grad_employ_subset, attr(grad_employ_subset, "metadata")$other_keys),
dup_check2(grad_employ_subset, attr(grad_employ_subset, "metadata")$other_keys),
dup_check3(grad_employ_subset, attr(grad_employ_subset, "metadata")$other_keys),
dup_check4(grad_employ_subset, attr(grad_employ_subset, "metadata")$other_keys),
min_time = 3
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl>
#> 1 "dup_check1(grad_employ_subset, … 18.94ms 19.92ms 49.8 NA 7.01
#> 2 "dup_check2(grad_employ_subset, … 12.97ms 14.04ms 70.6 NA 13.1
#> 3 "dup_check3(grad_employ_subset, … 2.92ms 3.02ms 322. NA 4.69
#> 4 "dup_check4(grad_employ_subset, … 2.84ms 3.13ms 312. NA 4.46
bad1 <- rbind(grad_employ_subset, grad_employ_subset[800,])
bench::mark(
dup_check1(bad1, attr(bad1, "metadata")$other_keys),
dup_check2(bad1, attr(bad1, "metadata")$other_keys),
dup_check3(bad1, attr(bad1, "metadata")$other_keys),
dup_check4(bad1, attr(bad1, "metadata")$other_keys),
min_time = 3
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl>
#> 1 "dup_check1(bad1, attr(bad1, \"m… 18.74ms 19.27ms 51.4 NA 6.95
#> 2 "dup_check2(bad1, attr(bad1, \"m… 12.49ms 13.55ms 72.7 NA 13.5
#> 3 "dup_check3(bad1, attr(bad1, \"m… 2.9ms 3.02ms 321. NA 4.84
#> 4 "dup_check4(bad1, attr(bad1, \"m… 2.94ms 3.04ms 317. NA 4.47
dup_check4(bad1, attr(bad1, "metadata")$other_keys)
#> [1] TRUE
# from ?arx_forecaster:
jhu <- covid_case_death_rates %>%
dplyr::filter(time_value >= as.Date("2021-12-01"))
out <- arx_forecaster(jhu, "death_rate",
c("case_rate", "death_rate"),
trainer = quantile_reg(),
args_list = arx_args_list(quantile_levels = 1:9 / 10)
)
fc_test1 <- out$predictions %>%
transmute(geo_value, time_value = target_date, .pred, .pred_distn) %>%
as_epi_df(as_of = attr(jhu, "metadata")$as_of)
dup_check4(fc_test1, character())
#> [1] FALSE
bench::mark(
dup_check1(fc_test1, attr(fc_test1, "metadata")$other_keys),
dup_check2(fc_test1, attr(fc_test1, "metadata")$other_keys),
dup_check3(fc_test1, attr(fc_test1, "metadata")$other_keys),
dup_check4(fc_test1, attr(fc_test1, "metadata")$other_keys),
min_time = 3
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 "dup_check1(fc_test1, attr(fc_… 7.79ms 8.19ms 118. NA 5.75
#> 2 "dup_check2(fc_test1, attr(fc_… 301.24µs 322.18µs 3005. NA 7.57
#> 3 "dup_check3(fc_test1, attr(fc_… 2.62ms 2.75ms 359. NA 3.40
#> 4 "dup_check4(fc_test1, attr(fc_… 2.61ms 2.73ms 355. NA 3.40
fc_test_bad1 <- rbind(fc_test1, fc_test1[30,])
dup_check4(fc_test_bad1, character())
#> [1] TRUE
bench::mark(
dup_check1(fc_test_bad1, attr(fc_test1, "metadata")$other_keys),
dup_check2(fc_test_bad1, attr(fc_test1, "metadata")$other_keys),
dup_check3(fc_test_bad1, attr(fc_test1, "metadata")$other_keys),
dup_check4(fc_test_bad1, attr(fc_test1, "metadata")$other_keys),
min_time = 3
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 "dup_check1(fc_test_bad1, attr… 7.67ms 7.84ms 123. NA 3.10
#> 2 "dup_check2(fc_test_bad1, attr… 305.54µs 323.89µs 3089. NA 6.50
#> 3 "dup_check3(fc_test_bad1, attr… 2.63ms 2.76ms 360. NA 3.40
#> 4 "dup_check4(fc_test_bad1, attr… 2.62ms 2.73ms 365. NA 3.40
packed_key_ex1 <- cases_deaths_subset %>%
mutate(geo_value = tibble(country = "us", state = .env$.data$geo_value))
dup_check4(packed_key_ex1, character())
#> [1] FALSE
bench::mark(
dup_check1(packed_key_ex1, attr(packed_key_ex1, "metadata")$other_keys),
dup_check2(packed_key_ex1, attr(packed_key_ex1, "metadata")$other_keys),
dup_check3(packed_key_ex1, attr(packed_key_ex1, "metadata")$other_keys),
dup_check4(packed_key_ex1, attr(packed_key_ex1, "metadata")$other_keys),
min_time = 3
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 "dup_check1(packed_key_ex1, at… 37.65ms 41.21ms 23.8 NA 5.44
#> 2 "dup_check2(packed_key_ex1, at… 174.24ms 176.55ms 5.66 NA 42.5
#> 3 "dup_check3(packed_key_ex1, at… 3.19ms 3.31ms 294. NA 4.47
#> 4 "dup_check4(packed_key_ex1, at… 2.85ms 3.16ms 314. NA 3.98
packed_key_bad1 <- cases_deaths_subset %>%
mutate(geo_value = tibble(country = "us", state = .env$.data$geo_value)) %>%
rbind({.})
dup_check4(packed_key_bad1, character())
#> [1] TRUE
bench::mark(
dup_check1(packed_key_bad1, attr(packed_key_bad1, "metadata")$other_keys),
dup_check2(packed_key_bad1, attr(packed_key_bad1, "metadata")$other_keys),
dup_check3(packed_key_bad1, attr(packed_key_bad1, "metadata")$other_keys),
dup_check4(packed_key_bad1, attr(packed_key_bad1, "metadata")$other_keys),
min_time = 3
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 "dup_check1(packed_key_bad1, a… 43.81ms 45.7ms 21.6 NA 4.99
#> 2 "dup_check2(packed_key_bad1, a… 332.53ms 339.56ms 2.93 NA 5.86
#> 3 "dup_check3(packed_key_bad1, a… 3.56ms 3.88ms 227. NA 6.66
#> 4 "dup_check4(packed_key_bad1, a… 3.43ms 3.72ms 266. NA 6.66
test_tbl2 <- test_tbl[1:1000,]
bench::mark(
dup_check1(test_tbl2, character()),
dup_check2(test_tbl2, character()),
dup_check3(test_tbl2, character()),
dup_check4(test_tbl2, character()),
min_time = 3
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl>
#> 1 dup_check1(test_tbl2, character(… 10.68ms 11.32ms 88.1 NA 4.62
#> 2 dup_check2(test_tbl2, character(… 3.54ms 3.78ms 268. NA 8.57
#> 3 dup_check3(test_tbl2, character(… 2.02ms 2.16ms 459. NA 3.05
#> 4 dup_check4(test_tbl2, character(… 2ms 2.14ms 469. NA 3.40
test_tbl2 <- test_tbl[1:56,]
bench::mark(
dup_check1(test_tbl2, character()),
dup_check2(test_tbl2, character()),
dup_check3(test_tbl2, character()),
dup_check4(test_tbl2, character()),
min_time = 3
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dup_check1(test_tbl2, characte… 4.28ms 4.57ms 219. NA 3.43
#> 2 dup_check2(test_tbl2, characte… 225.07µs 259.12µs 3853. NA 7.33
#> 3 dup_check3(test_tbl2, characte… 1.95ms 2.09ms 482. NA 3.40
#> 4 dup_check4(test_tbl2, characte… 1.95ms 2.09ms 480. NA 3.05
test_tbl3 <- test_tbl[1:200,]
bench::mark(
dup_check1(test_tbl3, character()),
dup_check2(test_tbl3, character()),
dup_check3(test_tbl3, character()),
dup_check4(test_tbl3, character()),
min_time = 3
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dup_check1(test_tbl3, characte… 5.23ms 5.66ms 175. NA 3.80
#> 2 dup_check2(test_tbl3, characte… 740.34µs 801.31µs 1252. NA 7.94
#> 3 dup_check3(test_tbl3, characte… 2.03ms 2.14ms 459. NA 3.06
#> 4 dup_check4(test_tbl3, characte… 2ms 2.1ms 472. NA 3.40
test_tbl4 <- test_tbl[1:500,]
bench::mark(
dup_check1(test_tbl4, character()),
dup_check2(test_tbl4, character()),
dup_check3(test_tbl4, character()),
dup_check4(test_tbl4, character()),
min_time = 3
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:t> <bch:> <dbl> <bch:byt> <dbl>
#> 1 dup_check1(test_tbl4, character()) 7.33ms 7.82ms 126. NA 4.03
#> 2 dup_check2(test_tbl4, character()) 1.77ms 1.91ms 521. NA 8.35
#> 3 dup_check3(test_tbl4, character()) 2.04ms 2.15ms 449. NA 3.07
#> 4 dup_check4(test_tbl4, character()) 1.98ms 2.12ms 465. NA 3.06
test_tbl5 <- test_tbl[1:1000,]
bench::mark(
dup_check1(test_tbl5, character()),
dup_check2(test_tbl5, character()),
dup_check3(test_tbl5, character()),
dup_check4(test_tbl5, character()),
min_time = 3
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl>
#> 1 dup_check1(test_tbl5, character(… 11.14ms 11.5ms 85.2 NA 4.63
#> 2 dup_check2(test_tbl5, character(… 3.52ms 3.79ms 265. NA 8.54
#> 3 dup_check3(test_tbl5, character(… 2.02ms 2.15ms 463. NA 3.06
#> 4 dup_check4(test_tbl5, character(… 2ms 2.14ms 469. NA 3.40 Created on 2024-11-25 with reprex v2.1.1 |
Memory results: Option 2 is always uses the least memory by a large factor. Option 3 sometimes uses less memory than option 1, option 4 always does (how much less varies).
|
Thanks for checking on the memory usage! I did suspect 2 would be better on memory usage... One key copy is probably avoidable by working off the original number of rows and binding generalized-dplyr-lag. More from native code but compilation's a bad idea to introduce at this moment / maybe ever. I'm not sure how much this needs optimized time or memory-wise. I think I noticed some long construction times so thought I would just throw this in. It still probably isn't fast enough to spam it in our hacky |
I'm considering trying to improve performance of duplicate key detection in library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(data.table)
#>
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#>
#> between, first, last
library(epiprocess)
#> Loading required package: epidatasets
#> Registered S3 method overwritten by 'tsibble':
#> method from
#> as_tibble.grouped_df dplyr
#>
#> Attaching package: 'epiprocess'
#> The following object is masked from 'package:stats':
#>
#> filter
x <- archive_cases_dv_subset
dup_check4 <- function(x) {
if (nrow(x) <= 1L) {
FALSE
} else {
ukey_names <- key(x)
arranged_ukeys <- arrange(x[ukey_names], across(all_of(ukey_names)))
any(vctrs::vec_equal(arranged_ukeys[-1L,], arranged_ukeys[-nrow(arranged_ukeys),]))
}
}
DT_as_tbl <- as_tibble(archive_cases_dv_subset$DT)
benchmark <- bench::mark(
as.logical(anyDuplicated(DT_as_tbl, by = key(DT))),
as.logical(anyDuplicated(x$DT, by = key(x$DT))),
dup_check4(x$DT)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
benchmark
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:> <bch:> <dbl> <bch:byt> <dbl>
#> 1 as.logical(anyDuplicated(DT_as_tbl… 1.31s 1.31s 0.762 2MB 12.2
#> 2 as.logical(anyDuplicated(x$DT, by … 3.23ms 3.52ms 273. 1.5MB 7.96
#> 3 dup_check4(x$DT) 3.02ms 3.18ms 293. 4.19MB 3.98 Created on 2025-01-23 with reprex v2.1.1 This might motivate us to consider a data.table-based approach in Second, some questions/issues with profiling. From
There's also |
When writing some code for archive to archive slides,
as_epi_df
was taking most of the time. I can/should probably avoid that withnew_epi_df
or anas_epi_df.data.table
, but it'd probably still be nice to speed this up in case we/users want to have the convenience/security ofas_epi_df
.Most of the time in

as_epi_df
appears to be spent in duplicate detection:Here's some limited testing on duplicate check approaches; looks like we can speed duplicate checks up by >50x, for "medium"-sized inputs at least.
Created on 2024-10-31 with reprex v2.1.1
vctrs::vec_equal
should keep this pretty general, though I don't know how it compares to less general approaches speed-wise.I'm not immediately PR-ing this because it probably needs a bit more correctness and performance testing on different sizes.
The text was updated successfully, but these errors were encountered: