Skip to content

Commit f6b3833

Browse files
authored
Add bounds argument to geom_density() (#4013)
* Draft update `StatDensity` to have `bounds` argument. * Refresh `bounds` usage in `StatDensity`. * Handle data points outside of `bounds`. * Update documentation with `bounds` example. * Use `cli::cli_warn()` instead of `warn()`. * Add tests for `stat_density()`. * Update tests for `stat_density()`. * Update 'NEWS.md'.
1 parent a979ffd commit f6b3833

File tree

5 files changed

+198
-5
lines changed

5 files changed

+198
-5
lines changed

NEWS.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# ggplot2 (development version)
22

3+
* `geom_density()` and `stat_density()` now support `bounds` argument
4+
to estimate density with boundary correction (@echasnovski, #4013).
5+
36
* ggplot now checks during statistical transformations whether any data
47
columns were dropped and warns about this. If stats intend to drop
58
data columns they can declare them in the new field `dropped_aes`.

R/geom-density.r

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@
3535
#' geom_density(alpha = 0.1) +
3636
#' xlim(55, 70)
3737
#'
38+
#' # Use `bounds` to adjust computation for known data limits
39+
#' big_diamonds <- diamonds[diamonds$carat >= 1, ]
40+
#' ggplot(big_diamonds, aes(carat)) +
41+
#' geom_density(color = 'red') +
42+
#' geom_density(bounds = c(1, Inf), color = 'blue')
43+
#'
3844
#' \donttest{
3945
#' # Stacked density plots: if you want to create a stacked density plot, you
4046
#' # probably want to 'count' (density * n) variable instead of the default

R/stat-density.r

Lines changed: 85 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@
1515
#' not line-up, and hence you won't be able to stack density values.
1616
#' This parameter only matters if you are displaying multiple densities in
1717
#' one plot or if you are manually adjusting the scale limits.
18+
#' @param bounds Known lower and upper bounds for estimated data. Default
19+
#' `c(-Inf, Inf)` means that there are no (finite) bounds. If any bound is
20+
#' finite, boundary effect of default density estimation will be corrected by
21+
#' reflecting tails outside `bounds` around their closest edge. Data points
22+
#' outside of bounds are removed with a warning.
1823
#' @section Computed variables:
1924
#' \describe{
2025
#' \item{density}{density estimate}
@@ -36,6 +41,7 @@ stat_density <- function(mapping = NULL, data = NULL,
3641
n = 512,
3742
trim = FALSE,
3843
na.rm = FALSE,
44+
bounds = c(-Inf, Inf),
3945
orientation = NA,
4046
show.legend = NA,
4147
inherit.aes = TRUE) {
@@ -55,6 +61,7 @@ stat_density <- function(mapping = NULL, data = NULL,
5561
n = n,
5662
trim = trim,
5763
na.rm = na.rm,
64+
bounds = bounds,
5865
orientation = orientation,
5966
...
6067
)
@@ -70,6 +77,8 @@ StatDensity <- ggproto("StatDensity", Stat,
7077

7178
default_aes = aes(x = after_stat(density), y = after_stat(density), fill = NA, weight = NULL),
7279

80+
dropped_aes = "weight",
81+
7382
setup_params = function(self, data, params) {
7483
params$flipped_aes <- has_flipped_aes(data, params, main_is_orthogonal = FALSE, main_is_continuous = TRUE)
7584

@@ -85,7 +94,8 @@ StatDensity <- ggproto("StatDensity", Stat,
8594
extra_params = c("na.rm", "orientation"),
8695

8796
compute_group = function(data, scales, bw = "nrd0", adjust = 1, kernel = "gaussian",
88-
n = 512, trim = FALSE, na.rm = FALSE, flipped_aes = FALSE) {
97+
n = 512, trim = FALSE, na.rm = FALSE, bounds = c(-Inf, Inf),
98+
flipped_aes = FALSE) {
8999
data <- flip_data(data, flipped_aes)
90100
if (trim) {
91101
range <- range(data$x, na.rm = TRUE)
@@ -94,22 +104,30 @@ StatDensity <- ggproto("StatDensity", Stat,
94104
}
95105

96106
density <- compute_density(data$x, data$weight, from = range[1],
97-
to = range[2], bw = bw, adjust = adjust, kernel = kernel, n = n)
107+
to = range[2], bw = bw, adjust = adjust, kernel = kernel, n = n,
108+
bounds = bounds)
98109
density$flipped_aes <- flipped_aes
99110
flip_data(density, flipped_aes)
100111
}
101112

102113
)
103114

104115
compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
105-
kernel = "gaussian", n = 512) {
116+
kernel = "gaussian", n = 512,
117+
bounds = c(-Inf, Inf)) {
106118
nx <- length(x)
107119
if (is.null(w)) {
108120
w <- rep(1 / nx, nx)
109121
} else {
110122
w <- w / sum(w)
111123
}
112124

125+
# Adjust data points and weights to all fit inside bounds
126+
sample_data <- fit_data_to_bounds(bounds, x, w)
127+
x <- sample_data$x
128+
w <- sample_data$w
129+
nx <- length(x)
130+
113131
# if less than 2 points return data frame of NAs and a warning
114132
if (nx < 2) {
115133
cli::cli_warn("Groups with fewer than two data points have been dropped.")
@@ -124,8 +142,16 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
124142
))
125143
}
126144

127-
dens <- stats::density(x, weights = w, bw = bw, adjust = adjust,
128-
kernel = kernel, n = n, from = from, to = to)
145+
# Decide whether to use boundary correction
146+
if (any(is.finite(bounds))) {
147+
dens <- stats::density(x, weights = w, bw = bw, adjust = adjust,
148+
kernel = kernel, n = n)
149+
150+
dens <- reflect_density(dens = dens, bounds = bounds, from = from, to = to)
151+
} else {
152+
dens <- stats::density(x, weights = w, bw = bw, adjust = adjust,
153+
kernel = kernel, n = n, from = from, to = to)
154+
}
129155

130156
data_frame0(
131157
x = dens$x,
@@ -137,3 +163,57 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
137163
.size = length(dens$x)
138164
)
139165
}
166+
167+
# Check if all data points are inside bounds. If not, warn and remove them.
168+
fit_data_to_bounds <- function(bounds, x, w) {
169+
is_inside_bounds <- (bounds[1] <= x) & (x <= bounds[2])
170+
171+
if (any(!is_inside_bounds)) {
172+
cli::cli_warn("Some data points are outside of `bounds`. Removing them.")
173+
x <- x[is_inside_bounds]
174+
w <- w[is_inside_bounds]
175+
w_sum <- sum(w)
176+
if (w_sum > 0) {
177+
w <- w / w_sum
178+
}
179+
}
180+
181+
return(list(x = x, w = w))
182+
}
183+
184+
# Update density estimation to mitigate boundary effect at known `bounds`:
185+
# - All x values will lie inside `bounds`.
186+
# - All y-values will be updated to have total probability of `bounds` be
187+
# closer to 1. This is done by reflecting tails outside of `bounds` around
188+
# their closest edge. This leads to those tails lie inside of `bounds`
189+
# (completely, if they are not wider than `bounds` itself, which is a common
190+
# situation) and correct boundary effect of default density estimation.
191+
#
192+
# `dens` - output of `stats::density`.
193+
# `bounds` - two-element vector with left and right known (user supplied)
194+
# bounds of x values.
195+
# `from`, `to` - numbers used as corresponding arguments of `stats::density()`
196+
# in case of no boundary correction.
197+
reflect_density <- function(dens, bounds, from, to) {
198+
# No adjustment is needed if no finite bounds are supplied
199+
if (all(is.infinite(bounds))) {
200+
return(dens)
201+
}
202+
203+
# Estimate linearly with zero tails (crucial to account for infinite bound)
204+
f_dens <- stats::approxfun(
205+
x = dens$x, y = dens$y, method = "linear", yleft = 0, yright = 0
206+
)
207+
208+
# Create a uniform x-grid inside `bounds`
209+
left <- max(from, bounds[1])
210+
right <- min(to, bounds[2])
211+
out_x <- seq(from = left, to = right, length.out = length(dens$x))
212+
213+
# Update density estimation by adding reflected tails from outside `bounds`
214+
left_reflection <- f_dens(bounds[1] + (bounds[1] - out_x))
215+
right_reflection <- f_dens(bounds[2] + (bounds[2] - out_x))
216+
out_y <- f_dens(out_x) + left_reflection + right_reflection
217+
218+
list(x = out_x, y = out_y)
219+
}

man/geom_density.Rd

Lines changed: 13 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-stat-density.R

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,94 @@
1+
test_that("stat_density actually computes density", {
2+
# Compare functon approximations because outputs from `ggplot()` and
3+
# `density()` give grids spanning different ranges
4+
dens <- stats::density(mtcars$mpg)
5+
expected_density_fun <- stats::approxfun(data.frame(x = dens$x, y = dens$y))
6+
7+
plot <- ggplot(mtcars, aes(mpg)) + stat_density()
8+
actual_density_fun <- stats::approxfun(layer_data(plot)[, c("x", "y")])
9+
10+
test_sample <- unique(mtcars$mpg)
11+
expect_equal(
12+
expected_density_fun(test_sample),
13+
actual_density_fun(test_sample),
14+
tolerance = 1e-3
15+
)
16+
})
17+
18+
test_that("stat_density can make weighted density estimation", {
19+
df <- mtcars
20+
df$weight <- mtcars$cyl / sum(mtcars$cyl)
21+
22+
dens <- stats::density(df$mpg, weights = df$weight)
23+
expected_density_fun <- stats::approxfun(data.frame(x = dens$x, y = dens$y))
24+
25+
plot <- ggplot(df, aes(mpg, weight = weight)) + stat_density()
26+
actual_density_fun <- stats::approxfun(layer_data(plot)[, c("x", "y")])
27+
28+
test_sample <- unique(df$mpg)
29+
expect_equal(
30+
expected_density_fun(test_sample),
31+
actual_density_fun(test_sample),
32+
tolerance = 1e-3
33+
)
34+
})
35+
36+
test_that("stat_density uses `bounds`", {
37+
mpg_min <- min(mtcars$mpg)
38+
mpg_max <- max(mtcars$mpg)
39+
40+
expect_bounds <- function(bounds) {
41+
dens <- stats::density(mtcars$mpg)
42+
orig_density <- stats::approxfun(
43+
data.frame(x = dens$x, y = dens$y),
44+
yleft = 0,
45+
yright = 0
46+
)
47+
48+
bounded_plot <- ggplot(mtcars, aes(mpg)) + stat_density(bounds = bounds)
49+
bounded_data <- layer_data(bounded_plot)[, c("x", "y")]
50+
plot_density <- stats::approxfun(bounded_data, yleft = 0, yright = 0)
51+
52+
test_sample <- seq(mpg_min, mpg_max, by = 0.1)
53+
left_reflection <- orig_density(bounds[1] + (bounds[1] - test_sample))
54+
right_reflection <- orig_density(bounds[2] + (bounds[2] - test_sample))
55+
56+
# Plot density should be an original plus added reflection at both `bounds`
57+
# (reflection around infinity is zero)
58+
expect_equal(
59+
orig_density(test_sample) + left_reflection + right_reflection,
60+
plot_density(test_sample),
61+
tolerance = 1e-4
62+
)
63+
}
64+
65+
expect_bounds(c(-Inf, Inf))
66+
expect_bounds(c(mpg_min, Inf))
67+
expect_bounds(c(-Inf, mpg_max))
68+
expect_bounds(c(mpg_min, mpg_max))
69+
})
70+
71+
test_that("stat_density handles data outside of `bounds`", {
72+
cutoff <- mtcars$mpg[1]
73+
74+
# Both `x` and `weight` should be filtered out for out of `bounds` points
75+
expect_warning(
76+
data_actual <- layer_data(
77+
ggplot(mtcars, aes(mpg, weight = cyl)) +
78+
stat_density(bounds = c(cutoff, Inf))
79+
),
80+
"outside of `bounds`"
81+
)
82+
83+
mtcars_filtered <- mtcars[mtcars$mpg >= cutoff, ]
84+
data_expected <- layer_data(
85+
ggplot(mtcars_filtered, aes(mpg, weight = cyl)) +
86+
stat_density(bounds = c(cutoff, Inf))
87+
)
88+
89+
expect_equal(data_actual, data_expected)
90+
})
91+
192
test_that("compute_density succeeds when variance is zero", {
293
dens <- compute_density(rep(0, 10), NULL, from = 0.5, to = 0.5)
394
expect_equal(dens$n, rep(10, 512))

0 commit comments

Comments
 (0)