From fa74733fa80f954a048fe1af20938f16f4c14201 Mon Sep 17 00:00:00 2001 From: Christopher Xiang Date: Sun, 14 Apr 2024 02:02:04 -0400 Subject: [PATCH 01/12] Added tests with and without np.nan --- pandas/tests/series/test_cumulative.py | 32 ++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 9b7b08127a550..bfda8dca5a9e9 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -170,6 +170,38 @@ def test_cummethods_bool_in_object_dtype(self, method, expected): result = getattr(ser, method)() tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "method, order", + [ + ["cummax", "abc"], + ["cummin", "cab"], + ], + ) + def test_cummax_cummin_on_ordered_categorical(self, method, order): + ser = pd.Series( + list("ababcab"), dtype=pd.CategoricalDtype(list(order), ordered=True) + ) + result = getattr(ser, method)() + tm.assert_series_equal(result, pd.Series(list("abbbccc"))) + + @pytest.mark.parametrize( + "method, order", + [ + ["cummax", "abc"], + ["cummin", "cab"], + ], + ) + def test_cummax_cummin_ordered_categorical_nan(self, method, order): + ser = pd.Series( + ["a", np.nan, "b", "a", "c"], + dtype=pd.CategoricalDtype(list(order), ordered=True), + ) + result = getattr(ser, method)(skipna=True) + tm.assert_series_equal(result, pd.Series(["a", "NaN", "b", "b", "c"])) + + result = getattr(ser, method)(skipna=False) + tm.assert_series_equal(result, pd.Series(["a", "NaN", "NaN", "NaN", "NaN"])) + def test_cumprod_timedelta(self): # GH#48111 ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) From 269dcfbe3055823be7c471939fd5fe7be5a6744b Mon Sep 17 00:00:00 2001 From: Christopher Xiang Date: Tue, 16 Apr 2024 01:47:25 -0400 Subject: [PATCH 02/12] Added tests for cummin and cummax --- .../arrays/categorical/test_cumulative.py | 45 +++++++++++++++++++ pandas/tests/series/test_cumulative.py | 2 + 2 
files changed, 47 insertions(+) create mode 100644 pandas/tests/arrays/categorical/test_cumulative.py diff --git a/pandas/tests/arrays/categorical/test_cumulative.py b/pandas/tests/arrays/categorical/test_cumulative.py new file mode 100644 index 0000000000000..fdc8cc1d316d9 --- /dev/null +++ b/pandas/tests/arrays/categorical/test_cumulative.py @@ -0,0 +1,45 @@ +""" +Tests for Ordered Categorical Array cumulative operations. +""" + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestAccumulator: + @pytest.mark.parametrize( + "method, order", + [ + ["cummax", "abc"], + ["cummin", "cab"], + ], + ) + def test_cummax_cummin_on_ordered_categorical(self, method, order): + # GH#52335 + arr = pd.array( + list("ababcab"), dtype=pd.CategoricalDtype(list(order), ordered=True) + ) + result = getattr(arr, method)() + tm.assert_series_equal(result, pd.Series(list("abbbccc"))) + + @pytest.mark.parametrize( + "method, order", + [ + ["cummax", "abc"], + ["cummin", "cab"], + ], + ) + def test_cummax_cummin_ordered_categorical_nan(self, method, order): + # GH#52335 + arr = pd.array( + ["a", np.nan, "b", "a", "c"], + dtype=pd.CategoricalDtype(list(order), ordered=True), + ) + result = getattr(arr, method)(skipna=True) + tm.assert_series_equal(result, pd.array(["a", "NaN", "b", "b", "c"])) + + result = getattr(arr, method)(skipna=False) + tm.assert_series_equal(result, pd.array(["a", "NaN", "NaN", "NaN", "NaN"])) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index bfda8dca5a9e9..33ddc92124811 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -178,6 +178,7 @@ def test_cummethods_bool_in_object_dtype(self, method, expected): ], ) def test_cummax_cummin_on_ordered_categorical(self, method, order): + # GH#52335 ser = pd.Series( list("ababcab"), dtype=pd.CategoricalDtype(list(order), ordered=True) ) @@ -192,6 +193,7 @@ def 
test_cummax_cummin_on_ordered_categorical(self, method, order): ], ) def test_cummax_cummin_ordered_categorical_nan(self, method, order): + # GH#52335 ser = pd.Series( ["a", np.nan, "b", "a", "c"], dtype=pd.CategoricalDtype(list(order), ordered=True), From 9ec47bfe79a9617d035c40abe9ccd984574c291d Mon Sep 17 00:00:00 2001 From: Christopher Xiang Date: Tue, 16 Apr 2024 23:26:46 -0400 Subject: [PATCH 03/12] Fixed series tests expected series, rewrote categorical arrays to use pd.Categorical --- .../arrays/categorical/test_cumulative.py | 37 ++++++++----------- pandas/tests/series/test_cumulative.py | 14 +++++-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_cumulative.py b/pandas/tests/arrays/categorical/test_cumulative.py index fdc8cc1d316d9..ba80ca88bade0 100644 --- a/pandas/tests/arrays/categorical/test_cumulative.py +++ b/pandas/tests/arrays/categorical/test_cumulative.py @@ -11,35 +11,28 @@ class TestAccumulator: @pytest.mark.parametrize( - "method, order", + "method, input, output", [ - ["cummax", "abc"], - ["cummin", "cab"], + ["cummin", [1, 2, 1, 2, 3, 3, 2, 1], [1, 2, 2, 2, 3, 3, 3, 3]], + ["commin", [3, 2, 3, 2, 1, 1, 2, 3], [3, 2, 2, 2, 1, 1, 1, 1]], ], ) - def test_cummax_cummin_on_ordered_categorical(self, method, order): + def test_cummax_cummin_on_ordered_categorical(self, method, input, output): # GH#52335 - arr = pd.array( - list("ababcab"), dtype=pd.CategoricalDtype(list(order), ordered=True) - ) - result = getattr(arr, method)() - tm.assert_series_equal(result, pd.Series(list("abbbccc"))) + result = pd.Categorical(input, ordered=True)._accumulate(method) + tm.assert_extension_array_equal(result, pd.Categorical(output, ordered=True)) @pytest.mark.parametrize( - "method, order", + "method, input, output", [ - ["cummax", "abc"], - ["cummin", "cab"], + ["cummax", [1, np.nan, 2, 1, 3], [1, np.nan, 2, 2, 3]], + ["commin", [3, np.nan, 2, 3, 1], [3, np.nan, 2, 2, 1]], ], ) - def 
test_cummax_cummin_ordered_categorical_nan(self, method, order): + def test_cummax_cummin_ordered_categorical_nan(self, method, input, output): # GH#52335 - arr = pd.array( - ["a", np.nan, "b", "a", "c"], - dtype=pd.CategoricalDtype(list(order), ordered=True), - ) - result = getattr(arr, method)(skipna=True) - tm.assert_series_equal(result, pd.array(["a", "NaN", "b", "b", "c"])) - - result = getattr(arr, method)(skipna=False) - tm.assert_series_equal(result, pd.array(["a", "NaN", "NaN", "NaN", "NaN"])) + result = pd.Categorical(input, ordered=True)._accumulate(method) + tm.assert_extension_array_equal(result, pd.Categorical(output, ordered=True)) + result = pd.Categorical(input, ordered=True)._accumulate(method) + output[2:] = np.nan + tm.assert_extension_array_equal(result, pd.Categorical(output, ordered=False)) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 33ddc92124811..a8190b9ab9902 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -179,11 +179,12 @@ def test_cummethods_bool_in_object_dtype(self, method, expected): ) def test_cummax_cummin_on_ordered_categorical(self, method, order): # GH#52335 + cat = pd.CategoricalDtype(list(order), ordered=True) ser = pd.Series( list("ababcab"), dtype=pd.CategoricalDtype(list(order), ordered=True) ) result = getattr(ser, method)() - tm.assert_series_equal(result, pd.Series(list("abbbccc"))) + tm.assert_series_equal(result, pd.Series(list("abbbccc"), dtype=cat)) @pytest.mark.parametrize( "method, order", @@ -194,15 +195,20 @@ def test_cummax_cummin_on_ordered_categorical(self, method, order): ) def test_cummax_cummin_ordered_categorical_nan(self, method, order): # GH#52335 + cat = pd.CategoricalDtype(list(order), ordered=True) ser = pd.Series( ["a", np.nan, "b", "a", "c"], - dtype=pd.CategoricalDtype(list(order), ordered=True), + dtype=cat, ) result = getattr(ser, method)(skipna=True) - tm.assert_series_equal(result, 
pd.Series(["a", "NaN", "b", "b", "c"])) + tm.assert_series_equal( + result, pd.Series(["a", np.nan, "b", "b", "c"], dtype=cat) + ) result = getattr(ser, method)(skipna=False) - tm.assert_series_equal(result, pd.Series(["a", "NaN", "NaN", "NaN", "NaN"])) + tm.assert_series_equal( + result, pd.Series(["a", np.nan, np.nan, np.nan, np.nan], dtype=cat) + ) def test_cumprod_timedelta(self): # GH#48111 From c224faf66f1dd38b1a0b4eef08368f6b7fe4d649 Mon Sep 17 00:00:00 2001 From: Christopher Xiang Date: Tue, 16 Apr 2024 23:35:17 -0400 Subject: [PATCH 04/12] Fixed cat not defined error and misspelling --- .../tests/arrays/categorical/test_cumulative.py | 4 ++-- pandas/tests/series/test_cumulative.py | 15 +++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_cumulative.py b/pandas/tests/arrays/categorical/test_cumulative.py index ba80ca88bade0..7aefb4c85de81 100644 --- a/pandas/tests/arrays/categorical/test_cumulative.py +++ b/pandas/tests/arrays/categorical/test_cumulative.py @@ -14,7 +14,7 @@ class TestAccumulator: "method, input, output", [ ["cummin", [1, 2, 1, 2, 3, 3, 2, 1], [1, 2, 2, 2, 3, 3, 3, 3]], - ["commin", [3, 2, 3, 2, 1, 1, 2, 3], [3, 2, 2, 2, 1, 1, 1, 1]], + ["cummin", [3, 2, 3, 2, 1, 1, 2, 3], [3, 2, 2, 2, 1, 1, 1, 1]], ], ) def test_cummax_cummin_on_ordered_categorical(self, method, input, output): @@ -26,7 +26,7 @@ def test_cummax_cummin_on_ordered_categorical(self, method, input, output): "method, input, output", [ ["cummax", [1, np.nan, 2, 1, 3], [1, np.nan, 2, 2, 3]], - ["commin", [3, np.nan, 2, 3, 1], [3, np.nan, 2, 2, 1]], + ["cummin", [3, np.nan, 2, 3, 1], [3, np.nan, 2, 2, 1]], ], ) def test_cummax_cummin_ordered_categorical_nan(self, method, input, output): diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index a8190b9ab9902..4bc8777856923 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -195,19 
+195,26 @@ def test_cummax_cummin_on_ordered_categorical(self, method, order): ) def test_cummax_cummin_ordered_categorical_nan(self, method, order): # GH#52335 - cat = pd.CategoricalDtype(list(order), ordered=True) ser = pd.Series( ["a", np.nan, "b", "a", "c"], - dtype=cat, + dtype=pd.CategoricalDtype(list(order), ordered=True), ) result = getattr(ser, method)(skipna=True) tm.assert_series_equal( - result, pd.Series(["a", np.nan, "b", "b", "c"], dtype=cat) + result, + pd.Series( + ["a", np.nan, "b", "b", "c"], + dtype=pd.CategoricalDtype(list(order), ordered=True), + ), ) result = getattr(ser, method)(skipna=False) tm.assert_series_equal( - result, pd.Series(["a", np.nan, np.nan, np.nan, np.nan], dtype=cat) + result, + pd.Series( + ["a", np.nan, np.nan, np.nan, np.nan], + dtype=pd.CategoricalDtype(list(order), ordered=True), + ), ) def test_cumprod_timedelta(self): From 330b60d6be815e6b095c116bd6128efc591c76ca Mon Sep 17 00:00:00 2001 From: bdwzhang Date: Tue, 16 Apr 2024 22:52:45 -0500 Subject: [PATCH 05/12] Implement _accumulate for Categorical --- .../array_algos/categorical_accumulations.py | 60 +++++++++++++++++++ pandas/core/arrays/categorical.py | 14 +++++ 2 files changed, 74 insertions(+) create mode 100644 pandas/core/array_algos/categorical_accumulations.py diff --git a/pandas/core/array_algos/categorical_accumulations.py b/pandas/core/array_algos/categorical_accumulations.py new file mode 100644 index 0000000000000..34c68847d166f --- /dev/null +++ b/pandas/core/array_algos/categorical_accumulations.py @@ -0,0 +1,60 @@ +""" +categorical_accumulations.py is for accumulation algorithms using a mask-based +approach for missing values. +""" + +from __future__ import annotations + +from typing import Callable + +import numpy as np + + +def _cum_func( + func: Callable, + values: np.ndarray, + *, + skipna: bool = True, +) -> np.ndarray: + """ + Accumulations for 1D categorical arrays. 
+ + We will modify values in place to replace NAs with the appropriate fill value. + + Parameters + ---------- + func : np.maximum.accumulate, np.minimum.accumulate + values : np.ndarray + Numpy integer array with the values and with NAs being -1. + skipna : bool, default True + Whether to skip NA. + """ + dtype_info = np.iinfo(values.dtype.type) + try: + fill_value = { + np.maximum.accumulate: dtype_info.min, + np.minimum.accumulate: dtype_info.max, + }[func] + except KeyError as err: + raise NotImplementedError( + f"No accumulation for {func} implemented on BaseMaskedArray" + ) from err + + mask = values == -1 + values[mask] = fill_value + + if not skipna: + mask = np.maximum.accumulate(mask) + + values = func(values) + values[mask] = -1 + + return values + + +def cummin(values: np.ndarray, *, skipna: bool = True) -> np.ndarray: + return _cum_func(np.minimum.accumulate, values, skipna=skipna) + + +def cummax(values: np.ndarray, *, skipna: bool = True) -> np.ndarray: + return _cum_func(np.maximum.accumulate, values, skipna=skipna) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3af4c528ceae8..7e9266550ba20 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -71,6 +71,7 @@ factorize, take_nd, ) +from pandas.core.array_algos import categorical_accumulations from pandas.core.arrays._mixins import ( NDArrayBackedExtensionArray, ravel_compat, @@ -2509,6 +2510,19 @@ def equals(self, other: object) -> bool: return np.array_equal(self._codes, other._codes) return False + def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> Self: + if name not in {"cummin", "cummax"}: + raise TypeError(f"Accumulation {name} not supported for {type(self)}") + + self.check_for_ordered(name) + + codes = self.codes.copy() + + op = getattr(categorical_accumulations, name) + codes = op(codes, skipna=skipna, **kwargs) + + return self._simple_new(codes, dtype=self._dtype) + @classmethod def 
_concat_same_type(cls, to_concat: Sequence[Self], axis: AxisInt = 0) -> Self: from pandas.core.dtypes.concat import union_categoricals From 0122334f330b664a3cb1ae18fd431fa653cd0b06 Mon Sep 17 00:00:00 2001 From: Christopher Xiang Date: Wed, 17 Apr 2024 00:14:15 -0400 Subject: [PATCH 06/12] fixed misspellings in tests --- .../arrays/categorical/test_cumulative.py | 27 ++++++++++++------- pandas/tests/series/test_cumulative.py | 4 +-- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_cumulative.py b/pandas/tests/arrays/categorical/test_cumulative.py index 7aefb4c85de81..6add03335c792 100644 --- a/pandas/tests/arrays/categorical/test_cumulative.py +++ b/pandas/tests/arrays/categorical/test_cumulative.py @@ -13,7 +13,7 @@ class TestAccumulator: @pytest.mark.parametrize( "method, input, output", [ - ["cummin", [1, 2, 1, 2, 3, 3, 2, 1], [1, 2, 2, 2, 3, 3, 3, 3]], + ["cummax", [1, 2, 1, 2, 3, 3, 2, 1], [1, 2, 2, 2, 3, 3, 3, 3]], ["cummin", [3, 2, 3, 2, 1, 1, 2, 3], [3, 2, 2, 2, 1, 1, 1, 1]], ], ) @@ -23,16 +23,25 @@ def test_cummax_cummin_on_ordered_categorical(self, method, input, output): tm.assert_extension_array_equal(result, pd.Categorical(output, ordered=True)) @pytest.mark.parametrize( - "method, input, output", + "method, skip, input, output", [ - ["cummax", [1, np.nan, 2, 1, 3], [1, np.nan, 2, 2, 3]], - ["cummin", [3, np.nan, 2, 3, 1], [3, np.nan, 2, 2, 1]], + ["cummax", True, [1, np.nan, 2, 1, 3], [1, np.nan, 2, 2, 3]], + [ + "cummax", + False, + [1, np.nan, 2, 1, 3], + [1, np.nan, np.nan, np.nan, np.nan], + ], + ["cummin", True, [3, np.nan, 2, 3, 1], [3, np.nan, 2, 2, 1]], + [ + "cummin", + False, + [3, np.nan, 2, 3, 1], + [3, np.nan, np.nan, np.nan, np.nan], + ], ], ) - def test_cummax_cummin_ordered_categorical_nan(self, method, input, output): + def test_cummax_cummin_ordered_categorical_nan(self, skip, method, input, output): # GH#52335 - result = pd.Categorical(input, ordered=True)._accumulate(method) + 
result = pd.Categorical(input, ordered=True)._accumulate(method, skipna=skip) tm.assert_extension_array_equal(result, pd.Categorical(output, ordered=True)) - result = pd.Categorical(input, ordered=True)._accumulate(method) - output[2:] = np.nan - tm.assert_extension_array_equal(result, pd.Categorical(output, ordered=False)) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 4bc8777856923..bfa8b6a11aafb 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -174,7 +174,7 @@ def test_cummethods_bool_in_object_dtype(self, method, expected): "method, order", [ ["cummax", "abc"], - ["cummin", "cab"], + ["cummin", "cba"], ], ) def test_cummax_cummin_on_ordered_categorical(self, method, order): @@ -190,7 +190,7 @@ def test_cummax_cummin_on_ordered_categorical(self, method, order): "method, order", [ ["cummax", "abc"], - ["cummin", "cab"], + ["cummin", "cba"], ], ) def test_cummax_cummin_ordered_categorical_nan(self, method, order): From 098ad644464244f1ba8ccb677e7cab306710380d Mon Sep 17 00:00:00 2001 From: Christopher Xiang Date: Sun, 21 Apr 2024 15:48:19 -0400 Subject: [PATCH 07/12] fixed expected categories on tests --- pandas/tests/arrays/categorical/test_cumulative.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_cumulative.py b/pandas/tests/arrays/categorical/test_cumulative.py index 6add03335c792..ffd5a3b59beab 100644 --- a/pandas/tests/arrays/categorical/test_cumulative.py +++ b/pandas/tests/arrays/categorical/test_cumulative.py @@ -44,4 +44,6 @@ def test_cummax_cummin_on_ordered_categorical(self, method, input, output): def test_cummax_cummin_ordered_categorical_nan(self, skip, method, input, output): # GH#52335 result = pd.Categorical(input, ordered=True)._accumulate(method, skipna=skip) - tm.assert_extension_array_equal(result, pd.Categorical(output, ordered=True)) + tm.assert_extension_array_equal( + result, 
pd.Categorical(output, categories=[1, 2, 3], ordered=True) + ) From 126bc19ff3d451755066f2ee2cf747c02a2777ca Mon Sep 17 00:00:00 2001 From: bdwzhang Date: Sun, 21 Apr 2024 14:56:35 -0500 Subject: [PATCH 08/12] Updated whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7a4f709e56104..cc3a471def927 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -39,6 +39,7 @@ Other enhancements - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) +- Implement :meth:`ExtensionArray._accumulate` operations ``cummax`` and ``cummin`` in :class:`Categorical` (:issue:`52335`) .. --------------------------------------------------------------------------- .. 
_whatsnew_300.notable_bug_fixes: From 18a86e269a95a6f7db64b698bb45e85a5bfbcfc2 Mon Sep 17 00:00:00 2001 From: bdwzhangumich <112042021+bdwzhangumich@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:31:48 -0400 Subject: [PATCH 09/12] Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4a6d20be3519e..70ef5998fcb39 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -41,7 +41,7 @@ Other enhancements - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) -- Implement :meth:`ExtensionArray._accumulate` operations ``cummax`` and ``cummin`` in :class:`Categorical` (:issue:`52335`) +- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`) .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: From 5c5ac3cec14032a0f6726b880b43cac46ce6a410 Mon Sep 17 00:00:00 2001 From: Christopher Xiang Date: Mon, 22 Apr 2024 13:46:36 -0400 Subject: [PATCH 10/12] Removed testing for _accumulate.
--- .../arrays/categorical/test_cumulative.py | 49 ------------------- 1 file changed, 49 deletions(-) delete mode 100644 pandas/tests/arrays/categorical/test_cumulative.py diff --git a/pandas/tests/arrays/categorical/test_cumulative.py b/pandas/tests/arrays/categorical/test_cumulative.py deleted file mode 100644 index ffd5a3b59beab..0000000000000 --- a/pandas/tests/arrays/categorical/test_cumulative.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Tests for Ordered Categorical Array cumulative operations. -""" - -import numpy as np -import pytest - -import pandas as pd -import pandas._testing as tm - - -class TestAccumulator: - @pytest.mark.parametrize( - "method, input, output", - [ - ["cummax", [1, 2, 1, 2, 3, 3, 2, 1], [1, 2, 2, 2, 3, 3, 3, 3]], - ["cummin", [3, 2, 3, 2, 1, 1, 2, 3], [3, 2, 2, 2, 1, 1, 1, 1]], - ], - ) - def test_cummax_cummin_on_ordered_categorical(self, method, input, output): - # GH#52335 - result = pd.Categorical(input, ordered=True)._accumulate(method) - tm.assert_extension_array_equal(result, pd.Categorical(output, ordered=True)) - - @pytest.mark.parametrize( - "method, skip, input, output", - [ - ["cummax", True, [1, np.nan, 2, 1, 3], [1, np.nan, 2, 2, 3]], - [ - "cummax", - False, - [1, np.nan, 2, 1, 3], - [1, np.nan, np.nan, np.nan, np.nan], - ], - ["cummin", True, [3, np.nan, 2, 3, 1], [3, np.nan, 2, 2, 1]], - [ - "cummin", - False, - [3, np.nan, 2, 3, 1], - [3, np.nan, np.nan, np.nan, np.nan], - ], - ], - ) - def test_cummax_cummin_ordered_categorical_nan(self, skip, method, input, output): - # GH#52335 - result = pd.Categorical(input, ordered=True)._accumulate(method, skipna=skip) - tm.assert_extension_array_equal( - result, pd.Categorical(output, categories=[1, 2, 3], ordered=True) - ) From 1abad3eed405fe08d03be9a78d3bf6989d20b527 Mon Sep 17 00:00:00 2001 From: bdwzhang Date: Mon, 22 Apr 2024 13:38:06 -0500 Subject: [PATCH 11/12] Moved categorical_accumulations.py logic to categorical.py --- .../array_algos/categorical_accumulations.py | 60 
------------------- pandas/core/arrays/categorical.py | 23 ++++--- 2 files changed, 16 insertions(+), 67 deletions(-) delete mode 100644 pandas/core/array_algos/categorical_accumulations.py diff --git a/pandas/core/array_algos/categorical_accumulations.py b/pandas/core/array_algos/categorical_accumulations.py deleted file mode 100644 index 34c68847d166f..0000000000000 --- a/pandas/core/array_algos/categorical_accumulations.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -categorical_accumulations.py is for accumulation algorithms using a mask-based -approach for missing values. -""" - -from __future__ import annotations - -from typing import Callable - -import numpy as np - - -def _cum_func( - func: Callable, - values: np.ndarray, - *, - skipna: bool = True, -) -> np.ndarray: - """ - Accumulations for 1D categorical arrays. - - We will modify values in place to replace NAs with the appropriate fill value. - - Parameters - ---------- - func : np.maximum.accumulate, np.minimum.accumulate - values : np.ndarray - Numpy integer array with the values and with NAs being -1. - skipna : bool, default True - Whether to skip NA. 
- """ - dtype_info = np.iinfo(values.dtype.type) - try: - fill_value = { - np.maximum.accumulate: dtype_info.min, - np.minimum.accumulate: dtype_info.max, - }[func] - except KeyError as err: - raise NotImplementedError( - f"No accumulation for {func} implemented on BaseMaskedArray" - ) from err - - mask = values == -1 - values[mask] = fill_value - - if not skipna: - mask = np.maximum.accumulate(mask) - - values = func(values) - values[mask] = -1 - - return values - - -def cummin(values: np.ndarray, *, skipna: bool = True) -> np.ndarray: - return _cum_func(np.minimum.accumulate, values, skipna=skipna) - - -def cummax(values: np.ndarray, *, skipna: bool = True) -> np.ndarray: - return _cum_func(np.maximum.accumulate, values, skipna=skipna) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4f4b933b54182..6a3cf4590568c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -6,6 +6,7 @@ from shutil import get_terminal_size from typing import ( TYPE_CHECKING, + Callable, Literal, cast, overload, @@ -71,7 +72,6 @@ factorize, take_nd, ) -from pandas.core.array_algos import categorical_accumulations from pandas.core.arrays._mixins import ( NDArrayBackedExtensionArray, ravel_compat, @@ -2510,16 +2510,25 @@ def equals(self, other: object) -> bool: return False def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> Self: - if name not in {"cummin", "cummax"}: + func: Callable + if name == "cummin": + func = np.minimum.accumulate + elif name == "cummax": + func = np.maximum.accumulate + else: raise TypeError(f"Accumulation {name} not supported for {type(self)}") - self.check_for_ordered(name) codes = self.codes.copy() - - op = getattr(categorical_accumulations, name) - codes = op(codes, skipna=skipna, **kwargs) - + mask = self.isna() + if func == np.minimum.accumulate: + codes[mask] = np.iinfo(codes.dtype.type).max + # no need to change codes for maximum because codes[mask] is already -1 + if 
not skipna: + mask = np.maximum.accumulate(mask) + + codes = func(codes) + codes[mask] = -1 return self._simple_new(codes, dtype=self._dtype) @classmethod From babc83548b454aca71557b3a8a9c7ac1cf17f9fb Mon Sep 17 00:00:00 2001 From: Christopher Xiang Date: Tue, 23 Apr 2024 14:36:16 -0400 Subject: [PATCH 12/12] Assigned expected results to expected variable; Added pytest.mark.parametrize to test_cummax_cummin_ordered_categorical_nan with skipna and expected data --- pandas/tests/series/test_cumulative.py | 39 +++++++++++++++----------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index bfa8b6a11aafb..a9d5486139b46 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -181,11 +181,23 @@ def test_cummax_cummin_on_ordered_categorical(self, method, order): # GH#52335 cat = pd.CategoricalDtype(list(order), ordered=True) ser = pd.Series( - list("ababcab"), dtype=pd.CategoricalDtype(list(order), ordered=True) + list("ababcab"), + dtype=cat, ) result = getattr(ser, method)() - tm.assert_series_equal(result, pd.Series(list("abbbccc"), dtype=cat)) + expected = pd.Series( + list("abbbccc"), + dtype=cat, + ) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "skip, exp", + [ + [True, ["a", np.nan, "b", "b", "c"]], + [False, ["a", np.nan, np.nan, np.nan, np.nan]], + ], + ) @pytest.mark.parametrize( "method, order", [ @@ -193,28 +205,21 @@ def test_cummax_cummin_on_ordered_categorical(self, method, order): ["cummin", "cba"], ], ) - def test_cummax_cummin_ordered_categorical_nan(self, method, order): + def test_cummax_cummin_ordered_categorical_nan(self, skip, exp, method, order): # GH#52335 + cat = pd.CategoricalDtype(list(order), ordered=True) ser = pd.Series( ["a", np.nan, "b", "a", "c"], - dtype=pd.CategoricalDtype(list(order), ordered=True), + dtype=cat, ) - result = getattr(ser, method)(skipna=True) - 
tm.assert_series_equal( - result, - pd.Series( - ["a", np.nan, "b", "b", "c"], - dtype=pd.CategoricalDtype(list(order), ordered=True), - ), + result = getattr(ser, method)(skipna=skip) + expected = pd.Series( + exp, + dtype=cat, ) - - result = getattr(ser, method)(skipna=False) tm.assert_series_equal( result, - pd.Series( - ["a", np.nan, np.nan, np.nan, np.nan], - dtype=pd.CategoricalDtype(list(order), ordered=True), - ), + expected, ) def test_cumprod_timedelta(self):