From e2cc83a473e87145d93ca87fffcfc4a4fac5e027 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 8 Mar 2025 14:14:59 -0800 Subject: [PATCH 1/3] ENH: Add Rolling.nunique() --- asv_bench/benchmarks/rolling.py | 14 +++- doc/source/reference/window.rst | 2 + doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/window/aggregations.pyi | 6 ++ pandas/_libs/window/aggregations.pyx | 64 +++++++++++++++++++ pandas/core/window/rolling.py | 47 ++++++++++++++ .../tests/window/test_cython_aggregations.py | 1 + pandas/tests/window/test_expanding.py | 21 ++++++ pandas/tests/window/test_groupby.py | 15 ++++- pandas/tests/window/test_rolling.py | 21 ++++++ 10 files changed, 190 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index bd4da00bfd2ad..f9a5f38c2e349 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -10,7 +10,19 @@ class Methods: ["DataFrame", "Series"], [("rolling", {"window": 10}), ("rolling", {"window": 1000}), ("expanding", {})], ["int", "float"], - ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"], + [ + "median", + "mean", + "max", + "min", + "std", + "count", + "skew", + "kurt", + "sum", + "sem", + "nunique", + ], ) param_names = ["constructor", "window_kwargs", "dtype", "method"] diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 2aeb57faac112..2bd63f02faf69 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -42,6 +42,7 @@ Rolling window functions Rolling.quantile Rolling.sem Rolling.rank + Rolling.nunique .. _api.functions_window: @@ -86,6 +87,7 @@ Expanding window functions Expanding.quantile Expanding.sem Expanding.rank + Expanding.nunique .. _api.functions_ewm: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9fab1d12fc6a5..99b5d4e0ac9eb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -62,6 +62,7 @@ Other enhancements - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`) +- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`) - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index ee735761e3dc6..b4bdd7e05cf0e 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -89,6 +89,12 @@ def roll_rank( method: WindowingRankType, ascending: bool, 
) -> np.ndarray: ... # np.ndarray[float] +def roll_nunique( + values: np.ndarray, # const float64_t[:] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] def roll_apply( obj: object, start: np.ndarray, # np.ndarray[np.int64] diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index d33c840371d2a..fccb23055ed28 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -6,6 +6,7 @@ from libc.math cimport ( sqrt, ) from libcpp.deque cimport deque +from libcpp.unordered_map cimport unordered_map from pandas._libs.algos cimport TiebreakEnumType @@ -1470,6 +1471,69 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, return np.asarray(output) +def roll_nunique(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp) -> np.ndarray: + """ + Rolling number of unique elements in the window + """ + cdef: + Py_ssize_t i, j, s, e, N = len(start) + int64_t nobs = 0, num_unique = 0 + float64_t val + float64_t[::1] output + unordered_map[float64_t, int64_t] value_counts + + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) + output = np.empty(N, dtype=np.float64) + value_counts = unordered_map[float64_t, int64_t]() + + with nogil: + for i in range(N): + s = start[i] + e = end[i] + + if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + if i != 0: + nobs = 0 + value_counts.clear() + + # setup + for j in range(s, e): + val = values[j] + if val == val: + nobs += 1 + value_counts[val] += 1 + + num_unique = value_counts.size() + + else: + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + if val == val: + value_counts[val] -= 1 + if value_counts[val] == 0: + value_counts.erase(val) + nobs -= 1 + + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + if val == val: + nobs += 1 + value_counts[val] += 1 + + num_unique = value_counts.size() + if nobs >= minp: + output[i] = num_unique + else: + output[i] = NaN + + return np.asarray(output) + + def roll_apply(object obj, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 69fce8cf2137e..03534bbee4c58 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1799,6 +1799,16 @@ def rank( return self._apply(window_func, name="rank", numeric_only=numeric_only) + def nunique( + self, + numeric_only: bool = False, + ): + window_func = partial( + window_aggregations.roll_nunique, + ) + + return self._apply(window_func, name="nunique", numeric_only=numeric_only) + def cov( self, other: DataFrame | Series | None = None, @@ -2855,6 +2865,43 @@ def rank( numeric_only=numeric_only, ) + @doc( + template_header, + ".. 
versionadded:: 3.0.0 \n\n", + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 4, 2, np.nan, 3, 3, 4, 5]) + >>> s.rolling(3).nunique() + 0 NaN + 1 NaN + 2 3.0 + 3 NaN + 4 NaN + 5 NaN + 6 2.0 + 7 3.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="nunique", + agg_method="nunique", + ) + def nunique( + self, + numeric_only: bool = False, + ): + return super().nunique( + numeric_only=numeric_only, + ) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/tests/window/test_cython_aggregations.py b/pandas/tests/window/test_cython_aggregations.py index feb25a294c540..39811ea3ec5b9 100644 --- a/pandas/tests/window/test_cython_aggregations.py +++ b/pandas/tests/window/test_cython_aggregations.py @@ -32,6 +32,7 @@ def _get_rolling_aggregations(): ("roll_min", window_aggregations.roll_min), ("roll_first", window_aggregations.roll_first), ("roll_last", window_aggregations.roll_last), + ("roll_nunique", window_aggregations.roll_nunique), ] + [ ( diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 39cedc3b692da..db2d6a8ca4f5a 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -255,6 +255,27 @@ def test_rank(window, method, pct, ascending, test_data): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("window", [1, 3, 10, 20]) +@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) +def test_nunique(window, test_data): + length = 20 + if test_data == "default": + ser = Series(data=np.random.default_rng(2).random(length)) + elif test_data == "duplicates": + ser = Series(data=np.random.default_rng(2).choice(3, length)) + elif test_data == "nans": + ser = Series( + data=np.random.default_rng(2).choice( + [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length + ) + ) + + expected = ser.expanding(window).apply(lambda x: x.nunique()) + result = ser.expanding(window).nunique() + + tm.assert_series_equal(result, expected) + + def test_expanding_corr(series): A = series.dropna() B = (A + np.random.default_rng(2).standard_normal(len(A)))[:-5] diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 392239b8adadd..1dcdad2bfd73d 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -96,6 +96,7 @@ def test_getitem_multiple(self, roll_frame): "count", "kurt", "skew", + "nunique", ], ) def test_rolling(self, f, roll_frame): @@ -1034,7 +1035,19 @@ def frame(self): return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) @pytest.mark.parametrize( - "f", ["sum", "mean", "min", "max", "first", "last", "count", "kurt", "skew"] + "f", + [ + "sum", + "mean", + "min", + "max", + "first", + "last", + "count", + "kurt", + "skew", + "nunique", + ], ) def test_expanding(self, f, frame): g = frame.groupby("A", group_keys=False) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 2aaa35ec5ec2c..9e63ec791d893 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1586,6 +1586,27 @@ def test_rank(window, method, pct, ascending, test_data): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("window", [1, 3, 10, 20]) 
+@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) +def test_nunique(window, test_data): + length = 20 + if test_data == "default": + ser = Series(data=np.random.default_rng(2).random(length)) + elif test_data == "duplicates": + ser = Series(data=np.random.default_rng(2).choice(3, length)) + elif test_data == "nans": + ser = Series( + data=np.random.default_rng(2).choice( + [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length + ) + ) + + expected = ser.rolling(window).apply(lambda x: x.nunique()) + result = ser.rolling(window).nunique() + + tm.assert_series_equal(result, expected) + + def test_rolling_quantile_np_percentile(): # #9413: Tests that rolling window's quantile default behavior # is analogous to Numpy's percentile From 5708d85aae6a6f31f305335b1a6a083cc1df55f6 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 8 Mar 2025 14:23:34 -0800 Subject: [PATCH 2/3] Add docstring for Expanding.nunique() --- pandas/core/window/expanding.py | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 81c89e1ef5428..bff3485c9cb86 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -927,6 +927,41 @@ def rank( numeric_only=numeric_only, ) + @doc( + template_header, + ".. versionadded:: 3.0.0 \n\n", + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 4, 2, 3, 5, 3]) + >>> s.expanding().nunique() + 0 1.0 + 1 2.0 + 2 3.0 + 3 4.0 + 4 5.0 + 5 5.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="nunique", + agg_method="nunique", + ) + def nunique( + self, + numeric_only: bool = False, + ): + return super().nunique( + numeric_only=numeric_only, + ) + @doc( template_header, create_section_header("Parameters"), From e6551982466468c9da0b873c786af2ab8d2b9b07 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 8 Mar 2025 15:43:09 -0800 Subject: [PATCH 3/3] Add a test for float precision issues --- pandas/_libs/window/aggregations.pyx | 7 ++----- pandas/tests/window/test_expanding.py | 18 +++++++++++++++++- pandas/tests/window/test_rolling.py | 18 +++++++++++++++++- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index fccb23055ed28..2baed13cbd7be 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1478,7 +1478,7 @@ def roll_nunique(const float64_t[:] values, ndarray[int64_t] start, """ cdef: Py_ssize_t i, j, s, e, N = len(start) - int64_t nobs = 0, num_unique = 0 + int64_t nobs = 0 float64_t val float64_t[::1] output unordered_map[float64_t, int64_t] value_counts @@ -1506,8 +1506,6 @@ def roll_nunique(const float64_t[:] values, ndarray[int64_t] start, nobs += 1 value_counts[val] += 1 - num_unique = value_counts.size() - else: # calculate deletes for j in range(start[i - 1], s): @@ -1525,9 +1523,8 @@ def roll_nunique(const float64_t[:] values, ndarray[int64_t] start, nobs += 1 value_counts[val] += 1 - num_unique = value_counts.size() if nobs >= minp: - output[i] = num_unique + output[i] = value_counts.size() else: output[i] = NaN diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index db2d6a8ca4f5a..2c96ce01c6328 
100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -256,7 +256,7 @@ def test_rank(window, method, pct, ascending, test_data): @pytest.mark.parametrize("window", [1, 3, 10, 20]) -@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) +@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"]) def test_nunique(window, test_data): length = 20 if test_data == "default": @@ -269,6 +269,22 @@ def test_nunique(window, test_data): [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length ) ) + elif test_data == "precision": + ser = Series( + data=[ + 0.3, + 0.1 * 3, # Not necessarily exactly 0.3 + 0.6, + 0.2 * 3, # Not necessarily exactly 0.6 + 0.9, + 0.3 * 3, # Not necessarily exactly 0.9 + 0.5, + 0.1 * 5, # Not necessarily exactly 0.5 + 0.8, + 0.2 * 4, # Not necessarily exactly 0.8 + ], + dtype=np.float64, + ) expected = ser.expanding(window).apply(lambda x: x.nunique()) result = ser.expanding(window).nunique() diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 9e63ec791d893..8c57781c1447c 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1587,7 +1587,7 @@ def test_rank(window, method, pct, ascending, test_data): @pytest.mark.parametrize("window", [1, 3, 10, 20]) -@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) +@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"]) def test_nunique(window, test_data): length = 20 if test_data == "default": @@ -1600,6 +1600,22 @@ def test_nunique(window, test_data): [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length ) ) + elif test_data == "precision": + ser = Series( + data=[ + 0.3, + 0.1 * 3, # Not necessarily exactly 0.3 + 0.6, + 0.2 * 3, # Not necessarily exactly 0.6 + 0.9, + 0.3 * 3, # Not necessarily exactly 0.9 + 0.5, + 0.1 * 5, # Not necessarily exactly 0.5 + 0.8, + 0.2 * 4, # Not necessarily exactly 0.8 + ], + dtype=np.float64, + ) expected = ser.rolling(window).apply(lambda x: x.nunique()) result = ser.rolling(window).nunique()
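
Reviewer note (not part of the patch): the Cython roll_nunique added above keeps a hash map of value counts for the current window and updates it incrementally as the window slides, so each element is added and removed at most once. The pure-Python sketch below mirrors that logic for readability only; it assumes monotonically increasing window bounds (the Cython version also falls back to rebuilding the counts when the bounds are not monotonic), and the name py_roll_nunique and its driver code are illustrative, not anything introduced by the patch.

import numpy as np


def py_roll_nunique(values, start, end, minp):
    # Illustrative pure-Python analogue of roll_nunique (not part of the patch).
    # values: 1-D float array; start/end: [start, end) bounds for each window;
    # minp: minimum number of non-NaN observations required to emit a value.
    counts = {}  # value -> number of occurrences inside the current window
    nobs = 0     # non-NaN observations inside the current window
    out = np.empty(len(start), dtype=np.float64)

    for i, (s, e) in enumerate(zip(start, end)):
        if i == 0 or s >= end[i - 1]:
            # First window, or no overlap with the previous one: rebuild.
            counts.clear()
            nobs = 0
            for val in values[s:e]:
                if val == val:  # NaN != NaN, so this skips missing values
                    nobs += 1
                    counts[val] = counts.get(val, 0) + 1
        else:
            # Drop the values that left the window since the last iteration...
            for val in values[start[i - 1]:s]:
                if val == val:
                    counts[val] -= 1
                    if counts[val] == 0:
                        del counts[val]
                    nobs -= 1
            # ...and add the values that entered it.
            for val in values[end[i - 1]:e]:
                if val == val:
                    nobs += 1
                    counts[val] = counts.get(val, 0) + 1

        out[i] = len(counts) if nobs >= minp else np.nan

    return out


# Fixed window of 3 over the series from the Rolling.nunique docstring example.
vals = np.array([1, 4, 2, np.nan, 3, 3, 4, 5], dtype=np.float64)
starts = np.maximum(np.arange(len(vals)) - 2, 0)
ends = np.arange(1, len(vals) + 1)
print(py_roll_nunique(vals, starts, ends, minp=3))
# [nan nan  3. nan nan nan  2.  3.]

With the patch applied, ser.rolling(window).nunique() is expected to agree with ser.rolling(window).apply(lambda x: x.nunique()) (and likewise for expanding windows), which is exactly what the new tests in test_rolling.py and test_expanding.py assert, including for windows containing NaN and for values that are only approximately equal (the "precision" cases added in the third commit).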