From e2cc83a473e87145d93ca87fffcfc4a4fac5e027 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 8 Mar 2025 14:14:59 -0800 Subject: [PATCH 1/3] ENH: Add Rolling.nunique() --- asv_bench/benchmarks/rolling.py | 14 +++- doc/source/reference/window.rst | 2 + doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/window/aggregations.pyi | 6 ++ pandas/_libs/window/aggregations.pyx | 64 +++++++++++++++++++ pandas/core/window/rolling.py | 47 ++++++++++++++ .../tests/window/test_cython_aggregations.py | 1 + pandas/tests/window/test_expanding.py | 21 ++++++ pandas/tests/window/test_groupby.py | 15 ++++- pandas/tests/window/test_rolling.py | 21 ++++++ 10 files changed, 190 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index bd4da00bfd2ad..f9a5f38c2e349 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -10,7 +10,19 @@ class Methods: ["DataFrame", "Series"], [("rolling", {"window": 10}), ("rolling", {"window": 1000}), ("expanding", {})], ["int", "float"], - ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"], + [ + "median", + "mean", + "max", + "min", + "std", + "count", + "skew", + "kurt", + "sum", + "sem", + "nunique", + ], ) param_names = ["constructor", "window_kwargs", "dtype", "method"] diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 2aeb57faac112..2bd63f02faf69 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -42,6 +42,7 @@ Rolling window functions Rolling.quantile Rolling.sem Rolling.rank + Rolling.nunique .. _api.functions_window: @@ -86,6 +87,7 @@ Expanding window functions Expanding.quantile Expanding.sem Expanding.rank + Expanding.nunique .. _api.functions_ewm: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9fab1d12fc6a5..99b5d4e0ac9eb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -62,6 +62,7 @@ Other enhancements - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`) +- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`) - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index ee735761e3dc6..b4bdd7e05cf0e 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -89,6 +89,12 @@ def roll_rank( method: WindowingRankType, ascending: bool, 
) -> np.ndarray: ... # np.ndarray[float] +def roll_nunique( + values: np.ndarray, # const float64_t[:] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] def roll_apply( obj: object, start: np.ndarray, # np.ndarray[np.int64] diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index d33c840371d2a..fccb23055ed28 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -6,6 +6,7 @@ from libc.math cimport ( sqrt, ) from libcpp.deque cimport deque +from libcpp.unordered_map cimport unordered_map from pandas._libs.algos cimport TiebreakEnumType @@ -1470,6 +1471,69 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, return np.asarray(output) +def roll_nunique(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp) -> np.ndarray: + """ + Rolling number of unique elements in the window + """ + cdef: + Py_ssize_t i, j, s, e, N = len(start) + int64_t nobs = 0, num_unique = 0 + float64_t val + float64_t[::1] output + unordered_map[float64_t, int64_t] value_counts + + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) + output = np.empty(N, dtype=np.float64) + value_counts = unordered_map[float64_t, int64_t]() + + with nogil: + for i in range(N): + s = start[i] + e = end[i] + + if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + if i != 0: + nobs = 0 + value_counts.clear() + + # setup + for j in range(s, e): + val = values[j] + if val == val: + nobs += 1 + value_counts[val] += 1 + + num_unique = value_counts.size() + + else: + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + if val == val: + value_counts[val] -= 1 + if value_counts[val] == 0: + value_counts.erase(val) + nobs -= 1 + + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + if val == val: + nobs += 1 + value_counts[val] += 1 + + num_unique = value_counts.size() + if nobs >= minp: + output[i] = num_unique + else: + output[i] = NaN + + return np.asarray(output) + + def roll_apply(object obj, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 69fce8cf2137e..03534bbee4c58 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1799,6 +1799,16 @@ def rank( return self._apply(window_func, name="rank", numeric_only=numeric_only) + def nunique( + self, + numeric_only: bool = False, + ): + window_func = partial( + window_aggregations.roll_nunique, + ) + + return self._apply(window_func, name="nunique", numeric_only=numeric_only) + def cov( self, other: DataFrame | Series | None = None, @@ -2855,6 +2865,43 @@ def rank( numeric_only=numeric_only, ) + @doc( + template_header, + ".. 
versionadded:: 3.0.0 \n\n", + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 4, 2, np.nan, 3, 3, 4, 5]) + >>> s.rolling(3).nunique() + 0 NaN + 1 NaN + 2 3.0 + 3 NaN + 4 NaN + 5 NaN + 6 2.0 + 7 3.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="nunique", + agg_method="nunique", + ) + def nunique( + self, + numeric_only: bool = False, + ): + return super().nunique( + numeric_only=numeric_only, + ) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/tests/window/test_cython_aggregations.py b/pandas/tests/window/test_cython_aggregations.py index feb25a294c540..39811ea3ec5b9 100644 --- a/pandas/tests/window/test_cython_aggregations.py +++ b/pandas/tests/window/test_cython_aggregations.py @@ -32,6 +32,7 @@ def _get_rolling_aggregations(): ("roll_min", window_aggregations.roll_min), ("roll_first", window_aggregations.roll_first), ("roll_last", window_aggregations.roll_last), + ("roll_nunique", window_aggregations.roll_nunique), ] + [ ( diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 39cedc3b692da..db2d6a8ca4f5a 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -255,6 +255,27 @@ def test_rank(window, method, pct, ascending, test_data): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("window", [1, 3, 10, 20]) +@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) +def test_nunique(window, test_data): + length = 20 + if test_data == "default": + ser = Series(data=np.random.default_rng(2).random(length)) + elif test_data == "duplicates": + ser = Series(data=np.random.default_rng(2).choice(3, length)) + elif test_data == "nans": + ser = Series( + data=np.random.default_rng(2).choice( + [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length + ) + ) + + expected = ser.expanding(window).apply(lambda x: x.nunique()) + result = ser.expanding(window).nunique() + + tm.assert_series_equal(result, expected) + + def test_expanding_corr(series): A = series.dropna() B = (A + np.random.default_rng(2).standard_normal(len(A)))[:-5] diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 392239b8adadd..1dcdad2bfd73d 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -96,6 +96,7 @@ def test_getitem_multiple(self, roll_frame): "count", "kurt", "skew", + "nunique", ], ) def test_rolling(self, f, roll_frame): @@ -1034,7 +1035,19 @@ def frame(self): return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) @pytest.mark.parametrize( - "f", ["sum", "mean", "min", "max", "first", "last", "count", "kurt", "skew"] + "f", + [ + "sum", + "mean", + "min", + "max", + "first", + "last", + "count", + "kurt", + "skew", + "nunique", + ], ) def test_expanding(self, f, frame): g = frame.groupby("A", group_keys=False) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 2aaa35ec5ec2c..9e63ec791d893 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1586,6 +1586,27 @@ def test_rank(window, method, pct, ascending, test_data): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("window", [1, 3, 10, 20]) 
+@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) +def test_nunique(window, test_data): + length = 20 + if test_data == "default": + ser = Series(data=np.random.default_rng(2).random(length)) + elif test_data == "duplicates": + ser = Series(data=np.random.default_rng(2).choice(3, length)) + elif test_data == "nans": + ser = Series( + data=np.random.default_rng(2).choice( + [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length + ) + ) + + expected = ser.rolling(window).apply(lambda x: x.nunique()) + result = ser.rolling(window).nunique() + + tm.assert_series_equal(result, expected) + + def test_rolling_quantile_np_percentile(): # #9413: Tests that rolling window's quantile default behavior # is analogous to Numpy's percentile From 5708d85aae6a6f31f305335b1a6a083cc1df55f6 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 8 Mar 2025 14:23:34 -0800 Subject: [PATCH 2/3] Add docstring for Expanding.nunique() --- pandas/core/window/expanding.py | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 81c89e1ef5428..bff3485c9cb86 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -927,6 +927,41 @@ def rank( numeric_only=numeric_only, ) + @doc( + template_header, + ".. versionadded:: 3.0.0 \n\n", + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 4, 2, 3, 5, 3]) + >>> s.expanding().nunique() + 0 1.0 + 1 2.0 + 2 3.0 + 3 4.0 + 4 5.0 + 5 5.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="nunique", + agg_method="nunique", + ) + def nunique( + self, + numeric_only: bool = False, + ): + return super().nunique( + numeric_only=numeric_only, + ) + @doc( template_header, create_section_header("Parameters"), From e6551982466468c9da0b873c786af2ab8d2b9b07 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 8 Mar 2025 15:43:09 -0800 Subject: [PATCH 3/3] Add a test for float precision issues --- pandas/_libs/window/aggregations.pyx | 7 ++----- pandas/tests/window/test_expanding.py | 18 +++++++++++++++++- pandas/tests/window/test_rolling.py | 18 +++++++++++++++++- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index fccb23055ed28..2baed13cbd7be 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1478,7 +1478,7 @@ def roll_nunique(const float64_t[:] values, ndarray[int64_t] start, """ cdef: Py_ssize_t i, j, s, e, N = len(start) - int64_t nobs = 0, num_unique = 0 + int64_t nobs = 0 float64_t val float64_t[::1] output unordered_map[float64_t, int64_t] value_counts @@ -1506,8 +1506,6 @@ def roll_nunique(const float64_t[:] values, ndarray[int64_t] start, nobs += 1 value_counts[val] += 1 - num_unique = value_counts.size() - else: # calculate deletes for j in range(start[i - 1], s): @@ -1525,9 +1523,8 @@ def roll_nunique(const float64_t[:] values, ndarray[int64_t] start, nobs += 1 value_counts[val] += 1 - num_unique = value_counts.size() if nobs >= minp: - output[i] = num_unique + output[i] = value_counts.size() else: output[i] = NaN diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index db2d6a8ca4f5a..2c96ce01c6328 
100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -256,7 +256,7 @@ def test_rank(window, method, pct, ascending, test_data): @pytest.mark.parametrize("window", [1, 3, 10, 20]) -@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) +@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"]) def test_nunique(window, test_data): length = 20 if test_data == "default": @@ -269,6 +269,22 @@ def test_nunique(window, test_data): [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length ) ) + elif test_data == "precision": + ser = Series( + data=[ + 0.3, + 0.1 * 3, # Not necessarily exactly 0.3 + 0.6, + 0.2 * 3, # Not necessarily exactly 0.6 + 0.9, + 0.3 * 3, # Not necessarily exactly 0.9 + 0.5, + 0.1 * 5, # Not necessarily exactly 0.5 + 0.8, + 0.2 * 4, # Not necessarily exactly 0.8 + ], + dtype=np.float64, + ) expected = ser.expanding(window).apply(lambda x: x.nunique()) result = ser.expanding(window).nunique() diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 9e63ec791d893..8c57781c1447c 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1587,7 +1587,7 @@ def test_rank(window, method, pct, ascending, test_data): @pytest.mark.parametrize("window", [1, 3, 10, 20]) -@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) +@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"]) def test_nunique(window, test_data): length = 20 if test_data == "default": @@ -1600,6 +1600,22 @@ def test_nunique(window, test_data): [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length ) ) + elif test_data == "precision": + ser = Series( + data=[ + 0.3, + 0.1 * 3, # Not necessarily exactly 0.3 + 0.6, + 0.2 * 3, # Not necessarily exactly 0.6 + 0.9, + 0.3 * 3, # Not necessarily exactly 0.9 + 0.5, + 0.1 * 5, # Not necessarily exactly 0.5 + 0.8, + 0.2 * 4, # Not necessarily exactly 0.8 + ], + dtype=np.float64, + ) expected = ser.rolling(window).apply(lambda x: x.nunique()) result = ser.rolling(window).nunique()
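
Reviewer note (not part of the patch): the Cython roll_nunique added above keeps a hash map of value counts for the current window and updates it incrementally as the window slides, so each element is added and removed at most once. The pure-Python sketch below mirrors that logic for readability only; it assumes monotonically increasing window bounds (the Cython version also falls back to rebuilding the counts when the bounds are not monotonic), and the name py_roll_nunique and its driver code are illustrative, not anything introduced by the patch.

import numpy as np


def py_roll_nunique(values, start, end, minp):
    # Illustrative pure-Python analogue of roll_nunique (not part of the patch).
    # values: 1-D float array; start/end: [start, end) bounds for each window;
    # minp: minimum number of non-NaN observations required to emit a value.
    counts = {}  # value -> number of occurrences inside the current window
    nobs = 0     # non-NaN observations inside the current window
    out = np.empty(len(start), dtype=np.float64)

    for i, (s, e) in enumerate(zip(start, end)):
        if i == 0 or s >= end[i - 1]:
            # First window, or no overlap with the previous one: rebuild.
            counts.clear()
            nobs = 0
            for val in values[s:e]:
                if val == val:  # NaN != NaN, so this skips missing values
                    nobs += 1
                    counts[val] = counts.get(val, 0) + 1
        else:
            # Drop the values that left the window since the last iteration...
            for val in values[start[i - 1]:s]:
                if val == val:
                    counts[val] -= 1
                    if counts[val] == 0:
                        del counts[val]
                    nobs -= 1
            # ...and add the values that entered it.
            for val in values[end[i - 1]:e]:
                if val == val:
                    nobs += 1
                    counts[val] = counts.get(val, 0) + 1

        out[i] = len(counts) if nobs >= minp else np.nan

    return out


# Fixed window of 3 over the series from the Rolling.nunique docstring example.
vals = np.array([1, 4, 2, np.nan, 3, 3, 4, 5], dtype=np.float64)
starts = np.maximum(np.arange(len(vals)) - 2, 0)
ends = np.arange(1, len(vals) + 1)
print(py_roll_nunique(vals, starts, ends, minp=3))
# [nan nan  3. nan nan nan  2.  3.]

With the patch applied, ser.rolling(window).nunique() is expected to agree with ser.rolling(window).apply(lambda x: x.nunique()) (and likewise for expanding windows), which is exactly what the new tests in test_rolling.py and test_expanding.py assert, including for windows containing NaN and for values that are only approximately equal (the "precision" cases added in the third commit).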