Add bounds argument to geom_density() (#4013)

echasnovski · web-flow · commit f6b383369b34 · 2022-07-26T15:51:57.000-05:00
* Draft update `StatDensity` to have `bounds` argument.

* Refresh `bounds` usage in `StatDensity`.

* Handle data points outside of `bounds`.

* Update documentation with `bounds` example.

* Use `cli::cli_warn()` instead of `warn()`.

* Add tests for `stat_density()`.

* Update tests for `stat_density()`.

* Update 'NEWS.md'.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,8 @@
 # ggplot2 (development version)
 
+* `geom_density()` and `stat_density()` now support `bounds` argument
+  to estimate density with boundary correction (@echasnovski, #4013).
+
 * ggplot now checks during statistical transformations whether any data 
   columns were dropped and warns about this. If stats intend to drop
   data columns they can declare them in the new field `dropped_aes`.
diff --git a/R/geom-density.r b/R/geom-density.r
@@ -35,6 +35,12 @@
 #'   geom_density(alpha = 0.1) +
 #'   xlim(55, 70)
 #'
+#' # Use `bounds` to adjust computation for known data limits
+#' big_diamonds <- diamonds[diamonds$carat >= 1, ]
+#' ggplot(big_diamonds, aes(carat)) +
+#'   geom_density(color = 'red') +
+#'   geom_density(bounds = c(1, Inf), color = 'blue')
+#'
 #' \donttest{
 #' # Stacked density plots: if you want to create a stacked density plot, you
 #' # probably want to 'count' (density * n) variable instead of the default
diff --git a/R/stat-density.r b/R/stat-density.r
@@ -15,6 +15,11 @@
 #'   not line-up, and hence you won't be able to stack density values.
 #'   This parameter only matters if you are displaying multiple densities in
 #'   one plot or if you are manually adjusting the scale limits.
+#' @param bounds Known lower and upper bounds for estimated data. Default
+#'   `c(-Inf, Inf)` means that there are no (finite) bounds. If any bound is
+#'   finite, boundary effect of default density estimation will be corrected by
+#'   reflecting tails outside `bounds` around their closest edge. Data points
+#'   outside of bounds are removed with a warning.
 #' @section Computed variables:
 #' \describe{
 #'   \item{density}{density estimate}
@@ -36,6 +41,7 @@ stat_density <- function(mapping = NULL, data = NULL,
                          n = 512,
                          trim = FALSE,
                          na.rm = FALSE,
+                         bounds = c(-Inf, Inf),
                          orientation = NA,
                          show.legend = NA,
                          inherit.aes = TRUE) {
@@ -55,6 +61,7 @@ stat_density <- function(mapping = NULL, data = NULL,
       n = n,
       trim = trim,
       na.rm = na.rm,
+      bounds = bounds,
       orientation = orientation,
       ...
     )
@@ -70,6 +77,8 @@ StatDensity <- ggproto("StatDensity", Stat,
 
   default_aes = aes(x = after_stat(density), y = after_stat(density), fill = NA, weight = NULL),
 
+  dropped_aes = "weight",
+
   setup_params = function(self, data, params) {
     params$flipped_aes <- has_flipped_aes(data, params, main_is_orthogonal = FALSE, main_is_continuous = TRUE)
 
@@ -85,7 +94,8 @@ StatDensity <- ggproto("StatDensity", Stat,
   extra_params = c("na.rm", "orientation"),
 
   compute_group = function(data, scales, bw = "nrd0", adjust = 1, kernel = "gaussian",
-                           n = 512, trim = FALSE, na.rm = FALSE, flipped_aes = FALSE) {
+                           n = 512, trim = FALSE, na.rm = FALSE, bounds = c(-Inf, Inf),
+                           flipped_aes = FALSE) {
     data <- flip_data(data, flipped_aes)
     if (trim) {
       range <- range(data$x, na.rm = TRUE)
@@ -94,22 +104,30 @@ StatDensity <- ggproto("StatDensity", Stat,
     }
 
     density <- compute_density(data$x, data$weight, from = range[1],
-      to = range[2], bw = bw, adjust = adjust, kernel = kernel, n = n)
+      to = range[2], bw = bw, adjust = adjust, kernel = kernel, n = n,
+      bounds = bounds)
     density$flipped_aes <- flipped_aes
     flip_data(density, flipped_aes)
   }
 
 )
 
 compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
-                            kernel = "gaussian", n = 512) {
+                            kernel = "gaussian", n = 512,
+                            bounds = c(-Inf, Inf)) {
   nx <- length(x)
   if (is.null(w)) {
     w <- rep(1 / nx, nx)
   } else {
     w <- w / sum(w)
   }
 
+  # Adjust data points and weights to all fit inside bounds
+  sample_data <- fit_data_to_bounds(bounds, x, w)
+  x <- sample_data$x
+  w <- sample_data$w
+  nx <- length(x)
+
   # if less than 2 points return data frame of NAs and a warning
   if (nx < 2) {
     cli::cli_warn("Groups with fewer than two data points have been dropped.")
@@ -124,8 +142,16 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
     ))
   }
 
-  dens <- stats::density(x, weights = w, bw = bw, adjust = adjust,
-    kernel = kernel, n = n, from = from, to = to)
+  # Decide whether to use boundary correction
+  if (any(is.finite(bounds))) {
+    dens <- stats::density(x, weights = w, bw = bw, adjust = adjust,
+                           kernel = kernel, n = n)
+
+    dens <- reflect_density(dens = dens, bounds = bounds, from = from, to = to)
+  } else {
+    dens <- stats::density(x, weights = w, bw = bw, adjust = adjust,
+                           kernel = kernel, n = n, from = from, to = to)
+  }
 
   data_frame0(
     x = dens$x,
@@ -137,3 +163,57 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
     .size = length(dens$x)
   )
 }
+
+# Check if all data points are inside bounds. If not, warn and remove them.
+fit_data_to_bounds <- function(bounds, x, w) {
+  is_inside_bounds <- (bounds[1] <= x) & (x <= bounds[2])
+
+  if (any(!is_inside_bounds)) {
+    cli::cli_warn("Some data points are outside of `bounds`. Removing them.")
+    x <- x[is_inside_bounds]
+    w <- w[is_inside_bounds]
+    w_sum <- sum(w)
+    if (w_sum > 0) {
+      w <- w / w_sum
+    }
+  }
+
+  return(list(x = x, w = w))
+}
+
+# Update density estimation to mitigate boundary effect at known `bounds`:
+# - All x values will lie inside `bounds`.
+# - All y-values will be updated to have total probability of `bounds` be
+#   closer to 1. This is done by reflecting tails outside of `bounds` around
+#   their closest edge. This leads to those tails lie inside of `bounds`
+#   (completely, if they are not wider than `bounds` itself, which is a common
+#   situation) and correct boundary effect of default density estimation.
+#
+# `dens` - output of `stats::density`.
+# `bounds` - two-element vector with left and right known (user supplied)
+#   bounds of x values.
+# `from`, `to` - numbers used as corresponding arguments of `stats::density()`
+#   in case of no boundary correction.
+reflect_density <- function(dens, bounds, from, to) {
+  # No adjustment is needed if no finite bounds are supplied
+  if (all(is.infinite(bounds))) {
+    return(dens)
+  }
+
+  # Estimate linearly with zero tails (crucial to account for infinite bound)
+  f_dens <- stats::approxfun(
+    x = dens$x, y = dens$y, method = "linear", yleft = 0, yright = 0
+  )
+
+  # Create a uniform x-grid inside `bounds`
+  left <- max(from, bounds[1])
+  right <- min(to, bounds[2])
+  out_x <- seq(from = left, to = right, length.out = length(dens$x))
+
+  # Update density estimation by adding reflected tails from outside `bounds`
+  left_reflection <- f_dens(bounds[1] + (bounds[1] - out_x))
+  right_reflection <- f_dens(bounds[2] + (bounds[2] - out_x))
+  out_y <- f_dens(out_x) + left_reflection + right_reflection
+
+  list(x = out_x, y = out_y)
+}
diff --git a/man/geom_density.Rd b/man/geom_density.Rd
diff --git a/tests/testthat/test-stat-density.R b/tests/testthat/test-stat-density.R
@@ -1,3 +1,94 @@
+test_that("stat_density actually computes density", {
+  # Compare functon approximations because outputs from `ggplot()` and
+  # `density()` give grids spanning different ranges
+  dens <- stats::density(mtcars$mpg)
+  expected_density_fun <- stats::approxfun(data.frame(x = dens$x, y = dens$y))
+
+  plot <- ggplot(mtcars, aes(mpg)) + stat_density()
+  actual_density_fun <- stats::approxfun(layer_data(plot)[, c("x", "y")])
+
+  test_sample <- unique(mtcars$mpg)
+  expect_equal(
+    expected_density_fun(test_sample),
+    actual_density_fun(test_sample),
+    tolerance = 1e-3
+  )
+})
+
+test_that("stat_density can make weighted density estimation", {
+  df <- mtcars
+  df$weight <- mtcars$cyl / sum(mtcars$cyl)
+
+  dens <- stats::density(df$mpg, weights = df$weight)
+  expected_density_fun <- stats::approxfun(data.frame(x = dens$x, y = dens$y))
+
+  plot <- ggplot(df, aes(mpg, weight = weight)) + stat_density()
+  actual_density_fun <- stats::approxfun(layer_data(plot)[, c("x", "y")])
+
+  test_sample <- unique(df$mpg)
+  expect_equal(
+    expected_density_fun(test_sample),
+    actual_density_fun(test_sample),
+    tolerance = 1e-3
+  )
+})
+
+test_that("stat_density uses `bounds`", {
+  mpg_min <- min(mtcars$mpg)
+  mpg_max <- max(mtcars$mpg)
+
+  expect_bounds <- function(bounds) {
+    dens <- stats::density(mtcars$mpg)
+    orig_density <- stats::approxfun(
+      data.frame(x = dens$x, y = dens$y),
+      yleft = 0,
+      yright = 0
+    )
+
+    bounded_plot <- ggplot(mtcars, aes(mpg)) + stat_density(bounds = bounds)
+    bounded_data <- layer_data(bounded_plot)[, c("x", "y")]
+    plot_density <- stats::approxfun(bounded_data, yleft = 0, yright = 0)
+
+    test_sample <- seq(mpg_min, mpg_max, by = 0.1)
+    left_reflection <- orig_density(bounds[1] + (bounds[1] - test_sample))
+    right_reflection <- orig_density(bounds[2] + (bounds[2] - test_sample))
+
+    # Plot density should be an original plus added reflection at both `bounds`
+    # (reflection around infinity is zero)
+    expect_equal(
+      orig_density(test_sample) + left_reflection + right_reflection,
+      plot_density(test_sample),
+      tolerance = 1e-4
+    )
+  }
+
+  expect_bounds(c(-Inf, Inf))
+  expect_bounds(c(mpg_min, Inf))
+  expect_bounds(c(-Inf, mpg_max))
+  expect_bounds(c(mpg_min, mpg_max))
+})
+
+test_that("stat_density handles data outside of `bounds`", {
+  cutoff <- mtcars$mpg[1]
+
+  # Both `x` and `weight` should be filtered out for out of `bounds` points
+  expect_warning(
+    data_actual <- layer_data(
+      ggplot(mtcars, aes(mpg, weight = cyl)) +
+        stat_density(bounds = c(cutoff, Inf))
+    ),
+    "outside of `bounds`"
+  )
+
+  mtcars_filtered <- mtcars[mtcars$mpg >= cutoff, ]
+  data_expected <- layer_data(
+    ggplot(mtcars_filtered, aes(mpg, weight = cyl)) +
+      stat_density(bounds = c(cutoff, Inf))
+  )
+
+  expect_equal(data_actual, data_expected)
+})
+
 test_that("compute_density succeeds when variance is zero", {
   dens <- compute_density(rep(0, 10), NULL, from = 0.5, to = 0.5)
   expect_equal(dens$n, rep(10, 512))