From 7fc2f14de4058326a766766c079bc56d91c9ef34 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 24 Feb 2022 22:38:10 -0500 Subject: [PATCH 1/2] minor bug fixes to outlier detection --- DESCRIPTION | 2 + NAMESPACE | 1 + R/outliers.R | 24 +- man/detect_outlr.Rd | 2 +- tests/testthat.R | 4 + tests/testthat/test-outliers.R | 7 + vignettes/outliers.Rmd | 3 +- vignettes/outliers.html | 520 +++++++++++++++++++++++++++++++++ 8 files changed, 552 insertions(+), 11 deletions(-) create mode 100644 tests/testthat.R create mode 100644 tests/testthat/test-outliers.R create mode 100644 vignettes/outliers.html diff --git a/DESCRIPTION b/DESCRIPTION index 97577506..9d3ce499 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -51,6 +51,8 @@ Imports: tidyr, tsibble Suggests: + testthat (>= 3.0.0), delphi.epidata Remotes: github::cmu-delphi/delphi-epidata-r +Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index b077342e..78363e2b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -64,4 +64,5 @@ importFrom(stats,cor) importFrom(stats,median) importFrom(tidyr,unnest) importFrom(tidyselect,eval_select) +importFrom(tidyselect,starts_with) importFrom(tsibble,as_tsibble) diff --git a/R/outliers.R b/R/outliers.R index f12b99e5..9bd370dc 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -45,12 +45,15 @@ #' #' @export detect_outlr = function(x = seq_along(y), y, - methods = tibble(method = "rm", + methods = tibble::tibble(method = "rm", args = list(list()), abbr = "rm"), combiner = c("median", "mean", "none")) { # Validate combiner combiner = match.arg(combiner) + + # Validate that x contains all distinct values + if (max(table(x)) > 1) Abort("`x` must not contain duplicate values; did you group your `epi_df` by all relevant key variables?") # Run all outlier detection methods results = purrr::pmap_dfc(methods, function(method, args, abbr) { @@ -187,6 +190,7 @@ detect_outlr_rm = function(x = seq_along(y), y, n = 21, #' description. #' #' @importFrom stats median +#' @importFrom tidyselect starts_with #' @export detect_outlr_stl = function(x = seq_along(y), y, n_trend = 21, @@ -216,11 +220,10 @@ detect_outlr_stl = function(x = seq_along(y), y, fabletools::model(feasts::STL(stl_formula, robust = TRUE)) %>% generics::components() %>% tibble::as_tibble() %>% - dplyr::transmute( - trend = trend, - seasonal = season_week, - resid = remainder) - + dplyr::select(trend:remainder) %>% + dplyr::rename_with(~ "seasonal", tidyselect::starts_with("season")) %>% + dplyr::rename(resid = remainder) + # Allocate the seasonal term from STL to either fitted or resid if (!is.null(seasonal_period)) { stl_components = stl_components %>% @@ -263,15 +266,18 @@ detect_outlr_stl = function(x = seq_along(y), y, # Common function for rolling IQR, using fitted and resid variables roll_iqr = function(z, n, detection_multiplier, min_radius, - replacement_multiplier, min_lower) { + replacement_multiplier, min_lower) { + if (typeof(z$y) == "integer") as_type = as.integer + else as_type = as.numeric + epi_slide(z, roll_iqr = IQR(resid), n = n, align = "center") %>% dplyr::mutate( lower = pmax(min_lower, fitted - pmax(min_radius, detection_multiplier * roll_iqr)), upper = fitted + pmax(min_radius, detection_multiplier * roll_iqr), replacement = dplyr::case_when( - (y < lower) ~ fitted - replacement_multiplier * roll_iqr, - (y > upper) ~ fitted + replacement_multiplier * roll_iqr, + (y < lower) ~ as_type(fitted - replacement_multiplier * roll_iqr), + (y > upper) ~ as_type(fitted + replacement_multiplier * roll_iqr), TRUE ~ y)) %>% dplyr::select(lower, upper, replacement) %>% tibble::as_tibble() diff --git a/man/detect_outlr.Rd b/man/detect_outlr.Rd index 4a3c24c0..d832cd6d 100644 --- a/man/detect_outlr.Rd +++ b/man/detect_outlr.Rd @@ -7,7 +7,7 @@ detect_outlr( x = seq_along(y), y, - methods = tibble(method = "rm", args = list(list()), abbr = "rm"), + methods = tibble::tibble(method = "rm", args = list(list()), abbr = "rm"), combiner = c("median", "mean", "none") ) } diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 00000000..b26d274b --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(epiprocess) + +test_check("epiprocess") diff --git a/tests/testthat/test-outliers.R b/tests/testthat/test-outliers.R new file mode 100644 index 00000000..acdc2a35 --- /dev/null +++ b/tests/testthat/test-outliers.R @@ -0,0 +1,7 @@ +test_that("detect_outlr throws error with duplicate x", { + expect_error(detect_outlr(x = c(1, 2, 3, 3, 4), y = 1:5)) +}) + +test_that("detect_outlr throws error with length(x) != length(y)", { + expect_error(detect_outlr(x = 1:3, y = 1:5)) +}) diff --git a/vignettes/outliers.Rmd b/vignettes/outliers.Rmd index 1f91f17d..22bae575 100644 --- a/vignettes/outliers.Rmd +++ b/vignettes/outliers.Rmd @@ -75,7 +75,8 @@ detection_methods = bind_rows( abbr = "rm"), tibble(method = "stl", args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5)), + detection_multiplier = 2.5, + seasonal_period = 7)), abbr = "stl_seasonal"), tibble(method = "stl", args = list(list(detect_negatives = TRUE, diff --git a/vignettes/outliers.html b/vignettes/outliers.html new file mode 100644 index 00000000..3770d380 --- /dev/null +++ b/vignettes/outliers.html @@ -0,0 +1,520 @@ + + + + + + + + + + + + + + +Detect and correct outliers in signals + + + + + + + + + + + + + + + + + + + + + + + + + +

Detect and correct outliers in signals

+ + + +

This vignette describes functionality for detecting and correcting outliers in signals in the detect_outlr() and correct_outlr() functions provided in the epiprocess package. These functions is designed to be modular and extendable, so that you can define your own outlier detection and correction routines and apply them to epi_df objects. We’ll demonstrate this using state-level daily reported COVID-19 case counts from FL and NJ.

+
library(covidcast)
+library(epiprocess)
+library(dplyr)
+library(tidyr)
+library(ggplot2)
+theme_set(theme_bw())
+
+x <- covidcast_signal("jhu-csse",
+                      "confirmed_incidence_num",
+                      start_day = "2020-06-01",
+                      end_day = "2021-05-31",
+                      geo_type = "state",
+                      geo_values = c("fl", "nj"),
+                      as_of = "2021-10-28") %>%
+  select(geo_value, time_value, cases = value) %>%
+  as_epi_df()
+
+ggplot(x, aes(x = time_value, y = cases)) +
+  geom_line() +
+  geom_hline(yintercept = 0, linetype = 3) +
+  facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
+  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
+  labs(x = "Date", y = "Reported COVID-19 counts")
+

+

There are multiple outliers in these data that a modeler may want to detect and correct. We’ll discuss those two tasks in turn.

+
+

Outlier detection

+

The detect_outlr() function allows us to run multiple outlier detection methods on a given signal, and then (optionally) combine the results from those methods. Here, we’ll investigate outlier detection results from the following methods.

+
    +
  1. Detection based on a rolling median, using detect_outlr_rm(), which computes a rolling median on with a default window size of n time points centered at the time point under consideration, and then computes thresholds based on a multiplier times a rolling IQR computed on the residuals.
  2. +
  3. Detection based on a seasonal-trend decomposition using LOESS (STL), using detect_outlr_stl(), which is similar to the rolling median method but replaces the rolling median with fitted values from STL.
  4. +
  5. Detection based on an STL decomposition, but without seasonality term, which amounts to smoothing using LOESS.
  6. +
+

The outlier detection methods are specified using a tibble that is passed to detect_outlr(), with one row per method, and whose columms specify the outlier detection function, any input arguments (only nondefault values need to be supplied), and an abbreviated name for the method used in tracking results. Abbreviations “rm” and “stl” can be used for the built-in detection functions detect_outlr_rm() and detect_outlr_stl(), respectively.

+
detection_methods = bind_rows(
+  tibble(method = "rm",
+         args = list(list(detect_negatives = TRUE,
+                          detection_multiplier = 2.5)),
+         abbr = "rm"),
+  tibble(method = "stl",
+         args = list(list(detect_negatives = TRUE,
+                          detection_multiplier = 2.5,
+                          seasonal_period = 7)),
+         abbr = "stl_seasonal"),
+  tibble(method = "stl",
+         args = list(list(detect_negatives = TRUE,
+                          detection_multiplier = 2.5,
+                          seasonal_period = NULL)),
+         abbr = "stl_nonseasonal"))
+
+detection_methods
+
## # A tibble: 3 × 3
+##   method args             abbr           
+##   <chr>  <list>           <chr>          
+## 1 rm     <named list [2]> rm             
+## 2 stl    <named list [3]> stl_seasonal   
+## 3 stl    <named list [3]> stl_nonseasonal
+

Additionally, we’ll form combined lower and upper thresholds, calculated as the median of the lower and upper thresholds from the methods at each time point. Note that using this combined median threshold is equivalent to using a majority vote across the base methods to determine whether a value is an outlier.

+
x <- x %>%
+  group_by(geo_value) %>%
+  mutate(outlier_info  = detect_outlr(
+    x = time_value, y = cases,
+    methods = detection_methods,
+    combiner = "median")) %>%
+  unnest(outlier_info)
+
+head(x)
+
## # A tibble: 6 × 15
+##   geo_value time_value cases rm_lower rm_upper rm_replacement stl_seasonal_lower
+##   <chr>     <date>     <dbl>    <dbl>    <dbl>          <dbl>              <dbl>
+## 1 fl        2020-06-01   667    345      2195             667                 0 
+## 2 nj        2020-06-01   486     64.4     926.            486               221.
+## 3 fl        2020-06-02   617    406.     2169.            617                 0 
+## 4 nj        2020-06-02   658    140.      841.            658               245.
+## 5 fl        2020-06-03  1317    468.     2142.           1317                 0 
+## 6 nj        2020-06-03   541    216       756             541               227.
+## # … with 8 more variables: stl_seasonal_upper <dbl>,
+## #   stl_seasonal_replacement <dbl>, stl_nonseasonal_lower <dbl>,
+## #   stl_nonseasonal_upper <dbl>, stl_nonseasonal_replacement <dbl>,
+## #   combined_lower <dbl>, combined_upper <dbl>, combined_replacement <dbl>
+

To visualize the results, we first define a convenience function for plotting.

+
# Plot outlier detection bands and/or points identified as outliers
+plot_outlr <- function(x, signal, method_abbr, bands = TRUE, points = TRUE, 
+                       facet_vars = vars(geo_value), nrow = NULL, ncol = NULL,
+                       scales = "fixed") {
+  # Convert outlier detection results to long format 
+  signal <- rlang::enquo(signal)
+  x_long <- x %>%
+    pivot_longer(
+      cols = starts_with(method_abbr),
+      names_to = c("method", ".value"),
+      names_pattern = "(.+)_(.+)")
+  
+  # Start of plot with observed data
+  p <- ggplot() +
+    geom_line(data = x, mapping = aes(x = time_value, y = !!signal))
+
+  # If requested, add bands
+  if (bands) 
+    p <- p + geom_ribbon(data = x_long, 
+                         aes(x = time_value, ymin = lower, ymax = upper, 
+                             color = method), fill = NA)
+
+  # If requested, add points
+  if (points) {
+    x_detected <- x_long %>% filter((!!signal < lower) | (!!signal > upper))
+    p <- p + geom_point(data = x_detected, 
+                        aes(x = time_value, y = !!signal, color = method, 
+                            shape = method))
+  }
+
+  # If requested, add faceting
+  if (!is.null(facet_vars)) 
+    p <- p + facet_wrap(facet_vars, nrow = nrow, ncol = ncol, scales = scales)
+
+  return(p)
+}
+

Now we produce plots for each state at a time, faceting by the detection method.

+
method_abbr <- c(detection_methods$abbr, "combined")
+
+plot_outlr(x %>% filter(geo_value == "fl"), cases, method_abbr,
+           facet_vars = vars(method), scales = "free_y", ncol = 1) +
+  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
+  labs(x = "Date", y = "Reported COVID-19 counts", color  = "Method",
+       shape = "Method")
+

+
plot_outlr(x %>% filter(geo_value == "nj"), cases, method_abbr,
+           facet_vars = vars(method), scales = "free_y", ncol = 1) +
+  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
+  labs(x = "Date", y = "Reported COVID-19 counts", color  = "Method",
+       shape = "Method")
+

+
+
+

Outlier correction

+

Finally, in order to correct outliers, we can use the posited replacement values returned by each outlier detection method. Below we use the replacement value from the combined method, which is defined by the median of replacement values from the base methods at each time point.

+
y <- x %>% 
+  mutate(cases_corrected = combined_replacement) %>%
+  select(geo_value, time_value, cases, cases_corrected) 
+
+y %>% filter(cases != cases_corrected)
+
## # A tibble: 22 × 4
+## # Groups:   geo_value [2]
+##    geo_value time_value cases cases_corrected
+##    <chr>     <date>     <dbl>           <dbl>
+##  1 fl        2020-07-12 15300          10181 
+##  2 nj        2020-07-19    -8            320.
+##  3 nj        2020-08-13   694            404.
+##  4 nj        2020-08-14   619            397.
+##  5 nj        2020-08-16    40            366 
+##  6 nj        2020-08-22   555            360 
+##  7 fl        2020-09-01  7569           2861.
+##  8 nj        2020-10-08  1415            873.
+##  9 fl        2020-10-10     0           2660 
+## 10 fl        2020-10-11  5570           2660 
+## # … with 12 more rows
+
ggplot(y, aes(x = time_value)) +
+  geom_line(aes(y = cases), linetype = 2) +
+  geom_line(aes(y = cases_corrected), col = 2) +
+  geom_hline(yintercept = 0, linetype = 3) +
+  facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
+  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
+  labs(x = "Date", y = "Reported COVID-19 counts")
+

+

More advanced correction functionality will be coming at some point in the future.

+
+ + + + + + + + + + + From 89fdba177e7a73e77a5600fecf7909aa468d2858 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 24 Feb 2022 22:53:51 -0500 Subject: [PATCH 2/2] Delete outliers.html --- vignettes/outliers.html | 520 ---------------------------------------- 1 file changed, 520 deletions(-) delete mode 100644 vignettes/outliers.html diff --git a/vignettes/outliers.html b/vignettes/outliers.html deleted file mode 100644 index 3770d380..00000000 --- a/vignettes/outliers.html +++ /dev/null @@ -1,520 +0,0 @@ - - - - - - - - - - - - - - -Detect and correct outliers in signals - - - - - - - - - - - - - - - - - - - - - - - - - -

Detect and correct outliers in signals

- - - -

This vignette describes functionality for detecting and correcting outliers in signals in the detect_outlr() and correct_outlr() functions provided in the epiprocess package. These functions is designed to be modular and extendable, so that you can define your own outlier detection and correction routines and apply them to epi_df objects. We’ll demonstrate this using state-level daily reported COVID-19 case counts from FL and NJ.

-
library(covidcast)
-library(epiprocess)
-library(dplyr)
-library(tidyr)
-library(ggplot2)
-theme_set(theme_bw())
-
-x <- covidcast_signal("jhu-csse",
-                      "confirmed_incidence_num",
-                      start_day = "2020-06-01",
-                      end_day = "2021-05-31",
-                      geo_type = "state",
-                      geo_values = c("fl", "nj"),
-                      as_of = "2021-10-28") %>%
-  select(geo_value, time_value, cases = value) %>%
-  as_epi_df()
-
-ggplot(x, aes(x = time_value, y = cases)) +
-  geom_line() +
-  geom_hline(yintercept = 0, linetype = 3) +
-  facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
-  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Reported COVID-19 counts")
-

-

There are multiple outliers in these data that a modeler may want to detect and correct. We’ll discuss those two tasks in turn.

-
-

Outlier detection

-

The detect_outlr() function allows us to run multiple outlier detection methods on a given signal, and then (optionally) combine the results from those methods. Here, we’ll investigate outlier detection results from the following methods.

-
    -
  1. Detection based on a rolling median, using detect_outlr_rm(), which computes a rolling median on with a default window size of n time points centered at the time point under consideration, and then computes thresholds based on a multiplier times a rolling IQR computed on the residuals.
  2. -
  3. Detection based on a seasonal-trend decomposition using LOESS (STL), using detect_outlr_stl(), which is similar to the rolling median method but replaces the rolling median with fitted values from STL.
  4. -
  5. Detection based on an STL decomposition, but without seasonality term, which amounts to smoothing using LOESS.
  6. -
-

The outlier detection methods are specified using a tibble that is passed to detect_outlr(), with one row per method, and whose columms specify the outlier detection function, any input arguments (only nondefault values need to be supplied), and an abbreviated name for the method used in tracking results. Abbreviations “rm” and “stl” can be used for the built-in detection functions detect_outlr_rm() and detect_outlr_stl(), respectively.

-
detection_methods = bind_rows(
-  tibble(method = "rm",
-         args = list(list(detect_negatives = TRUE,
-                          detection_multiplier = 2.5)),
-         abbr = "rm"),
-  tibble(method = "stl",
-         args = list(list(detect_negatives = TRUE,
-                          detection_multiplier = 2.5,
-                          seasonal_period = 7)),
-         abbr = "stl_seasonal"),
-  tibble(method = "stl",
-         args = list(list(detect_negatives = TRUE,
-                          detection_multiplier = 2.5,
-                          seasonal_period = NULL)),
-         abbr = "stl_nonseasonal"))
-
-detection_methods
-
## # A tibble: 3 × 3
-##   method args             abbr           
-##   <chr>  <list>           <chr>          
-## 1 rm     <named list [2]> rm             
-## 2 stl    <named list [3]> stl_seasonal   
-## 3 stl    <named list [3]> stl_nonseasonal
-

Additionally, we’ll form combined lower and upper thresholds, calculated as the median of the lower and upper thresholds from the methods at each time point. Note that using this combined median threshold is equivalent to using a majority vote across the base methods to determine whether a value is an outlier.

-
x <- x %>%
-  group_by(geo_value) %>%
-  mutate(outlier_info  = detect_outlr(
-    x = time_value, y = cases,
-    methods = detection_methods,
-    combiner = "median")) %>%
-  unnest(outlier_info)
-
-head(x)
-
## # A tibble: 6 × 15
-##   geo_value time_value cases rm_lower rm_upper rm_replacement stl_seasonal_lower
-##   <chr>     <date>     <dbl>    <dbl>    <dbl>          <dbl>              <dbl>
-## 1 fl        2020-06-01   667    345      2195             667                 0 
-## 2 nj        2020-06-01   486     64.4     926.            486               221.
-## 3 fl        2020-06-02   617    406.     2169.            617                 0 
-## 4 nj        2020-06-02   658    140.      841.            658               245.
-## 5 fl        2020-06-03  1317    468.     2142.           1317                 0 
-## 6 nj        2020-06-03   541    216       756             541               227.
-## # … with 8 more variables: stl_seasonal_upper <dbl>,
-## #   stl_seasonal_replacement <dbl>, stl_nonseasonal_lower <dbl>,
-## #   stl_nonseasonal_upper <dbl>, stl_nonseasonal_replacement <dbl>,
-## #   combined_lower <dbl>, combined_upper <dbl>, combined_replacement <dbl>
-

To visualize the results, we first define a convenience function for plotting.

-
# Plot outlier detection bands and/or points identified as outliers
-plot_outlr <- function(x, signal, method_abbr, bands = TRUE, points = TRUE, 
-                       facet_vars = vars(geo_value), nrow = NULL, ncol = NULL,
-                       scales = "fixed") {
-  # Convert outlier detection results to long format 
-  signal <- rlang::enquo(signal)
-  x_long <- x %>%
-    pivot_longer(
-      cols = starts_with(method_abbr),
-      names_to = c("method", ".value"),
-      names_pattern = "(.+)_(.+)")
-  
-  # Start of plot with observed data
-  p <- ggplot() +
-    geom_line(data = x, mapping = aes(x = time_value, y = !!signal))
-
-  # If requested, add bands
-  if (bands) 
-    p <- p + geom_ribbon(data = x_long, 
-                         aes(x = time_value, ymin = lower, ymax = upper, 
-                             color = method), fill = NA)
-
-  # If requested, add points
-  if (points) {
-    x_detected <- x_long %>% filter((!!signal < lower) | (!!signal > upper))
-    p <- p + geom_point(data = x_detected, 
-                        aes(x = time_value, y = !!signal, color = method, 
-                            shape = method))
-  }
-
-  # If requested, add faceting
-  if (!is.null(facet_vars)) 
-    p <- p + facet_wrap(facet_vars, nrow = nrow, ncol = ncol, scales = scales)
-
-  return(p)
-}
-

Now we produce plots for each state at a time, faceting by the detection method.

-
method_abbr <- c(detection_methods$abbr, "combined")
-
-plot_outlr(x %>% filter(geo_value == "fl"), cases, method_abbr,
-           facet_vars = vars(method), scales = "free_y", ncol = 1) +
-  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Reported COVID-19 counts", color  = "Method",
-       shape = "Method")
-

-
plot_outlr(x %>% filter(geo_value == "nj"), cases, method_abbr,
-           facet_vars = vars(method), scales = "free_y", ncol = 1) +
-  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Reported COVID-19 counts", color  = "Method",
-       shape = "Method")
-

-
-
-

Outlier correction

-

Finally, in order to correct outliers, we can use the posited replacement values returned by each outlier detection method. Below we use the replacement value from the combined method, which is defined by the median of replacement values from the base methods at each time point.

-
y <- x %>% 
-  mutate(cases_corrected = combined_replacement) %>%
-  select(geo_value, time_value, cases, cases_corrected) 
-
-y %>% filter(cases != cases_corrected)
-
## # A tibble: 22 × 4
-## # Groups:   geo_value [2]
-##    geo_value time_value cases cases_corrected
-##    <chr>     <date>     <dbl>           <dbl>
-##  1 fl        2020-07-12 15300          10181 
-##  2 nj        2020-07-19    -8            320.
-##  3 nj        2020-08-13   694            404.
-##  4 nj        2020-08-14   619            397.
-##  5 nj        2020-08-16    40            366 
-##  6 nj        2020-08-22   555            360 
-##  7 fl        2020-09-01  7569           2861.
-##  8 nj        2020-10-08  1415            873.
-##  9 fl        2020-10-10     0           2660 
-## 10 fl        2020-10-11  5570           2660 
-## # … with 12 more rows
-
ggplot(y, aes(x = time_value)) +
-  geom_line(aes(y = cases), linetype = 2) +
-  geom_line(aes(y = cases_corrected), col = 2) +
-  geom_hline(yintercept = 0, linetype = 3) +
-  facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
-  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Reported COVID-19 counts")
-

-

More advanced correction functionality will be coming at some point in the future.

-
- - - - - - - - - - -