Skip to content

ENH: Add Rolling.nunique() #61087

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion asv_bench/benchmarks/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,19 @@ class Methods:
["DataFrame", "Series"],
[("rolling", {"window": 10}), ("rolling", {"window": 1000}), ("expanding", {})],
["int", "float"],
["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"],
[
"median",
"mean",
"max",
"min",
"std",
"count",
"skew",
"kurt",
"sum",
"sem",
"nunique",
],
)
param_names = ["constructor", "window_kwargs", "dtype", "method"]

Expand Down
2 changes: 2 additions & 0 deletions doc/source/reference/window.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ Rolling window functions
Rolling.quantile
Rolling.sem
Rolling.rank
Rolling.nunique

.. _api.functions_window:

Expand Down Expand Up @@ -86,6 +87,7 @@ Expanding window functions
Expanding.quantile
Expanding.sem
Expanding.rank
Expanding.nunique

.. _api.functions_ewm:

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Other enhancements
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
Expand Down
6 changes: 6 additions & 0 deletions pandas/_libs/window/aggregations.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ def roll_rank(
method: WindowingRankType,
ascending: bool,
) -> np.ndarray: ... # np.ndarray[float]
# Count of distinct non-NaN values in each [start[i], end[i]) window; NaN is
# emitted for windows with fewer than `minp` non-NaN observations (see the
# implementation in aggregations.pyx).
def roll_nunique(
    values: np.ndarray,  # const float64_t[:]
    start: np.ndarray,  # np.ndarray[np.int64]
    end: np.ndarray,  # np.ndarray[np.int64]
    minp: int,  # int64_t
) -> np.ndarray: ...  # np.ndarray[float]
def roll_apply(
obj: object,
start: np.ndarray, # np.ndarray[np.int64]
Expand Down
61 changes: 61 additions & 0 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ from libc.math cimport (
sqrt,
)
from libcpp.deque cimport deque
from libcpp.unordered_map cimport unordered_map

from pandas._libs.algos cimport TiebreakEnumType

Expand Down Expand Up @@ -1470,6 +1471,66 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
return np.asarray(output)


def roll_nunique(const float64_t[:] values, ndarray[int64_t] start,
                 ndarray[int64_t] end, int64_t minp) -> np.ndarray:
    """
    Rolling number of unique elements in the window.

    Counts the distinct non-NaN values in each window ``[start[i], end[i])``,
    maintaining a value -> multiplicity hash map so that, when the window
    bounds are monotonically increasing, each step only touches the elements
    that enter or leave the window instead of rescanning it. Windows with
    fewer than ``minp`` non-NaN observations produce NaN.
    """
    cdef:
        Py_ssize_t i, j, s, e, N = len(start)
        int64_t nobs = 0  # number of non-NaN values currently in the window
        float64_t val
        float64_t[::1] output
        unordered_map[float64_t, int64_t] value_counts  # value -> multiplicity

    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
        start, end
    )
    output = np.empty(N, dtype=np.float64)
    value_counts = unordered_map[float64_t, int64_t]()

    with nogil:
        for i in range(N):
            s = start[i]
            e = end[i]

            # Rebuild the counts from scratch when sliding is impossible:
            # first window, non-monotonic bounds, or no overlap with the
            # previous window (s >= end[i - 1]).
            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
                if i != 0:
                    nobs = 0
                    value_counts.clear()

                # setup
                for j in range(s, e):
                    val = values[j]
                    if val == val:  # NaN != NaN, so this skips NaNs
                        nobs += 1
                        value_counts[val] += 1

            else:
                # calculate deletes: values that dropped off the left edge
                for j in range(start[i - 1], s):
                    val = values[j]
                    if val == val:
                        value_counts[val] -= 1
                        # erase exhausted keys so map.size() stays equal to
                        # the number of distinct values in the window
                        if value_counts[val] == 0:
                            value_counts.erase(val)
                        nobs -= 1

                # calculate adds: values that entered at the right edge
                for j in range(end[i - 1], e):
                    val = values[j]
                    if val == val:
                        nobs += 1
                        value_counts[val] += 1

            if nobs >= minp:
                output[i] = value_counts.size()
            else:
                output[i] = NaN

    return np.asarray(output)


def roll_apply(object obj,
ndarray[int64_t] start, ndarray[int64_t] end,
int64_t minp,
Expand Down
35 changes: 35 additions & 0 deletions pandas/core/window/expanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -927,6 +927,41 @@ def rank(
numeric_only=numeric_only,
)

# The public docstring is assembled by the @doc decorator from the shared
# window templates; do not add a literal docstring to the function body.
@doc(
    template_header,
    ".. versionadded:: 3.0.0 \n\n",
    create_section_header("Parameters"),
    kwargs_numeric_only,
    create_section_header("Returns"),
    template_returns,
    create_section_header("See Also"),
    template_see_also,
    create_section_header("Examples"),
    dedent(
        """
    >>> s = pd.Series([1, 4, 2, 3, 5, 3])
    >>> s.expanding().nunique()
    0    1.0
    1    2.0
    2    3.0
    3    4.0
    4    5.0
    5    5.0
    dtype: float64
    """
    ).replace("\n", "", 1),
    window_method="expanding",
    aggregation_description="nunique",
    agg_method="nunique",
)
def nunique(
    self,
    numeric_only: bool = False,
):
    # Thin wrapper: the shared rolling/expanding implementation is provided
    # by the superclass; this override only exists to attach the
    # expanding-specific documentation above.
    return super().nunique(
        numeric_only=numeric_only,
    )

@doc(
template_header,
create_section_header("Parameters"),
Expand Down
47 changes: 47 additions & 0 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1799,6 +1799,16 @@ def rank(

return self._apply(window_func, name="rank", numeric_only=numeric_only)

def nunique(
    self,
    numeric_only: bool = False,
):
    """
    Shared rolling/expanding count of distinct values per window.

    Parameters
    ----------
    numeric_only : bool, default False
        Forwarded to ``_apply``; restricts the aggregation to numeric
        columns.

    Returns
    -------
    Result of applying ``roll_nunique`` over the windows: the number of
    distinct non-NaN values per window, NaN where a window has fewer
    observations than ``min_periods``.
    """
    # roll_nunique takes no extra parameters, so hand it to _apply directly
    # instead of wrapping it in a no-argument functools.partial -- this
    # mirrors how other parameterless kernels (e.g. roll_sum in count) are
    # forwarded.
    window_func = window_aggregations.roll_nunique

    return self._apply(window_func, name="nunique", numeric_only=numeric_only)

def cov(
self,
other: DataFrame | Series | None = None,
Expand Down Expand Up @@ -2855,6 +2865,43 @@ def rank(
numeric_only=numeric_only,
)

# The public docstring is assembled by the @doc decorator from the shared
# window templates; do not add a literal docstring to the function body.
@doc(
    template_header,
    ".. versionadded:: 3.0.0 \n\n",
    create_section_header("Parameters"),
    kwargs_numeric_only,
    create_section_header("Returns"),
    template_returns,
    create_section_header("See Also"),
    template_see_also,
    create_section_header("Examples"),
    dedent(
        """
    >>> s = pd.Series([1, 4, 2, np.nan, 3, 3, 4, 5])
    >>> s.rolling(3).nunique()
    0    NaN
    1    NaN
    2    3.0
    3    NaN
    4    NaN
    5    NaN
    6    2.0
    7    3.0
    dtype: float64
    """
    ).replace("\n", "", 1),
    window_method="rolling",
    aggregation_description="nunique",
    agg_method="nunique",
)
def nunique(
    self,
    numeric_only: bool = False,
):
    # Thin wrapper: the shared rolling/expanding implementation is provided
    # by the superclass; this override only exists to attach the
    # rolling-specific documentation above.
    return super().nunique(
        numeric_only=numeric_only,
    )

@doc(
template_header,
create_section_header("Parameters"),
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/window/test_cython_aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def _get_rolling_aggregations():
("roll_min", window_aggregations.roll_min),
("roll_first", window_aggregations.roll_first),
("roll_last", window_aggregations.roll_last),
("roll_nunique", window_aggregations.roll_nunique),
]
+ [
(
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/window/test_expanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,43 @@ def test_rank(window, method, pct, ascending, test_data):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("window", [1, 3, 10, 20])
@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"])
def test_nunique(window, test_data):
    """Expanding.nunique must match applying Series.nunique per window."""
    length = 20
    random_makers = {
        "default": lambda: np.random.default_rng(2).random(length),
        "duplicates": lambda: np.random.default_rng(2).choice(3, length),
        "nans": lambda: np.random.default_rng(2).choice(
            [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length
        ),
    }
    if test_data in random_makers:
        ser = Series(data=random_makers[test_data]())
    else:
        # "precision": pairs that are only approximately equal must still
        # be counted as distinct values
        ser = Series(
            data=[
                0.3,
                0.1 * 3,  # Not necessarily exactly 0.3
                0.6,
                0.2 * 3,  # Not necessarily exactly 0.6
                0.9,
                0.3 * 3,  # Not necessarily exactly 0.9
                0.5,
                0.1 * 5,  # Not necessarily exactly 0.5
                0.8,
                0.2 * 4,  # Not necessarily exactly 0.8
            ],
            dtype=np.float64,
        )

    result = ser.expanding(window).nunique()
    expected = ser.expanding(window).apply(lambda x: x.nunique())

    tm.assert_series_equal(result, expected)


def test_expanding_corr(series):
A = series.dropna()
B = (A + np.random.default_rng(2).standard_normal(len(A)))[:-5]
Expand Down
15 changes: 14 additions & 1 deletion pandas/tests/window/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def test_getitem_multiple(self, roll_frame):
"count",
"kurt",
"skew",
"nunique",
],
)
def test_rolling(self, f, roll_frame):
Expand Down Expand Up @@ -1034,7 +1035,19 @@ def frame(self):
return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)})

@pytest.mark.parametrize(
"f", ["sum", "mean", "min", "max", "first", "last", "count", "kurt", "skew"]
"f",
[
"sum",
"mean",
"min",
"max",
"first",
"last",
"count",
"kurt",
"skew",
"nunique",
],
)
def test_expanding(self, f, frame):
g = frame.groupby("A", group_keys=False)
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1586,6 +1586,43 @@ def test_rank(window, method, pct, ascending, test_data):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("window", [1, 3, 10, 20])
@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"])
def test_nunique(window, test_data):
    """Rolling.nunique must match applying Series.nunique per window."""
    length = 20
    if test_data == "precision":
        # Values that are close but not equal must be counted separately.
        ser = Series(
            data=[
                0.3,
                0.1 * 3,  # Not necessarily exactly 0.3
                0.6,
                0.2 * 3,  # Not necessarily exactly 0.6
                0.9,
                0.3 * 3,  # Not necessarily exactly 0.9
                0.5,
                0.1 * 5,  # Not necessarily exactly 0.5
                0.8,
                0.2 * 4,  # Not necessarily exactly 0.8
            ],
            dtype=np.float64,
        )
    else:
        gen = np.random.default_rng(2)
        if test_data == "default":
            values = gen.random(length)
        elif test_data == "duplicates":
            values = gen.choice(3, length)
        else:  # "nans": include NaN (ignored) and +/-inf (distinct values)
            values = gen.choice([1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length)
        ser = Series(data=values)

    result = ser.rolling(window).nunique()
    expected = ser.rolling(window).apply(lambda x: x.nunique())

    tm.assert_series_equal(result, expected)


def test_rolling_quantile_np_percentile():
# #9413: Tests that rolling window's quantile default behavior
# is analogous to Numpy's percentile
Expand Down
Loading