From 1e7e563257c80258333535b729de11f08ca29c89 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 19 Apr 2023 06:57:16 +0100 Subject: [PATCH 01/65] ENH: better dtype inference when doing DataFrame reductions --- pandas/core/arrays/base.py | 13 ++- pandas/core/arrays/masked.py | 45 +++++++- pandas/core/frame.py | 4 +- pandas/core/internals/blocks.py | 5 +- pandas/tests/arrays/integer/test_reduction.py | 45 ++++++++ pandas/tests/frame/test_reductions.py | 104 +++++++++++++++++- 6 files changed, 204 insertions(+), 12 deletions(-) create mode 100644 pandas/tests/arrays/integer/test_reduction.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 27eb7994d3ccb..8b6c6ece6fe0e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1402,7 +1402,9 @@ def _accumulate( """ raise NotImplementedError(f"cannot perform {name} with type {self.dtype}") - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): """ Return a scalar result of performing the reduction operation. @@ -1432,7 +1434,14 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): f"'{type(self).__name__}' with dtype {self.dtype} " f"does not support reduction '{name}'" ) - return meth(skipna=skipna, **kwargs) + result = meth(skipna=skipna, **kwargs) + + if keepdims: + # if subclasses want to avoid wrapping in np.array, do: + # super()._reduce(..., keepdims=False) and wrap that. + return np.array([[result]]) + else: + return result # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 # Incompatible types in assignment (expression has type "None", base class diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index df9224e192140..7f580e5aa6851 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -41,10 +41,13 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( is_bool, + is_float_dtype, is_integer_dtype, is_list_like, is_scalar, is_string_dtype, + is_unsigned_integer_dtype, + is_signed_integer_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import BaseMaskedDtype @@ -1079,7 +1082,15 @@ def _quantile( # ------------------------------------------------------------------ # Reductions - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if keepdims: + res = self.reshape(-1, 1)._reduce(name=name, skipna=skipna, **kwargs) + if res is libmissing.NA: + res = self._wrap_na_result(name) + return res + if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}: return getattr(self, name)(skipna=skipna, **kwargs) @@ -1107,6 +1118,30 @@ def _wrap_reduction_result(self, name: str, result, skipna, **kwargs): return self._maybe_mask_result(result, mask) return result + def _wrap_min_count_reduction_result( + self, name: str, result, skipna, min_count, **kwargs + ): + if min_count == 0 and isinstance(result, np.ndarray): + return self._maybe_mask_result(result, np.zeros(1, dtype=bool)) + return self._wrap_reduction_result(name, result, skipna, **kwargs) + + def _wrap_na_result(self, name): + mask = np.ones(1, dtype=bool) + + if is_float_dtype(self.dtype): + np_dtype = np.float64 + elif name in ["mean", "median", "var", "std", "skew"]: + np_dtype = np.float64 + elif is_signed_integer_dtype(self.dtype): + np_dtype = np.int64 + elif 
is_unsigned_integer_dtype(self.dtype): + np_dtype = np.uint64 + else: + raise TypeError(self.dtype) + + value = np.array([1], dtype=np_dtype) + return self._maybe_mask_result(value, mask=mask) + def sum( self, *, @@ -1124,8 +1159,8 @@ def sum( min_count=min_count, axis=axis, ) - return self._wrap_reduction_result( - "sum", result, skipna=skipna, axis=axis, **kwargs + return self._wrap_min_count_reduction_result( + "sum", result, skipna=skipna, min_count=min_count, axis=axis, **kwargs ) def prod( @@ -1144,8 +1179,8 @@ def prod( min_count=min_count, axis=axis, ) - return self._wrap_reduction_result( - "prod", result, skipna=skipna, axis=axis, **kwargs + return self._wrap_min_count_reduction_result( + "sum", result, skipna=skipna, min_count=min_count, axis=axis, **kwargs ) def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f1521e78be0d4..b88616175c4ab 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10848,7 +10848,7 @@ def blk_func(values, axis: Axis = 1): self._mgr, ArrayManager ): return values._reduce(name, axis=1, skipna=skipna, **kwds) - return values._reduce(name, skipna=skipna, **kwds) + return values._reduce(name, skipna=skipna, keepdims=True, **kwds) else: return op(values, axis=axis, skipna=skipna, **kwds) @@ -10893,7 +10893,7 @@ def _get_data() -> DataFrame: out = out.astype(out_dtype) elif (df._mgr.get_dtypes() == object).any(): out = out.astype(object) - elif len(self) == 0 and name in ("sum", "prod"): + elif len(self) == 0 and out.dtype == object and name in ("sum", "prod"): # Even if we are object dtype, follow numpy and return # float64, see test_apply_funcs_over_empty out = out.astype(np.float64) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7c5d686d96939..63f3ca1886d87 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -342,7 +342,10 @@ def reduce(self, func) -> list[Block]: if self.values.ndim == 1: # TODO(EA2D): special case not needed with 2D EAs - res_values = np.array([[result]]) + if isinstance(result, (np.ndarray, ExtensionArray)): + res_values = result + else: + res_values = np.array([[result]]) else: res_values = result.reshape(-1, 1) diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py new file mode 100644 index 0000000000000..08fe7ad624ae7 --- /dev/null +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -0,0 +1,45 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) +def test_preserve_dtypes(op): + # TODO(#22346): preserve Int64 dtype + # for ops that enable (mean would actually work here + # but generally it is a float return value) + df = pd.DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": pd.array([1, None, 3], dtype="Int64"), + } + ) + + # op + result = getattr(df.C, op)() + if op in {"sum", "prod", "min", "max"}: + assert isinstance(result, np.int64) + else: + assert isinstance(result, int) + + # groupby + result = getattr(df.groupby("A"), op)() + + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": pd.array([1, 3], dtype="Int64")}, + index=pd.Index(["a", "b"], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"]) +def test_dataframe_reductions(op): + # https://github.com/pandas-dev/pandas/pull/32867 + # ensure the 
integers are not cast to float during reductions + df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")}) + result = getattr(df, op)() + assert isinstance(result["a"], np.int64) + diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index b4a4324593d22..7e2caec53eb3c 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,7 +6,10 @@ import numpy as np import pytest -from pandas.compat import is_platform_windows +from pandas.compat import ( + IS64, + is_platform_windows, +) import pandas.util._test_decorators as td import pandas as pd @@ -29,6 +32,8 @@ nanops, ) +is_windows_or_is32 = is_platform_windows() or not IS64 + def assert_stat_op_calc( opname, @@ -935,7 +940,7 @@ def test_mean_extensionarray_numeric_only_true(self): arr = np.random.randint(1000, size=(10, 5)) df = DataFrame(arr, dtype="Int64") result = df.mean(numeric_only=True) - expected = DataFrame(arr).mean() + expected = DataFrame(arr).mean().astype("Float64") tm.assert_series_equal(result, expected) def test_stats_mixed_type(self, float_string_frame): @@ -1668,6 +1673,101 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): getattr(np, method)(df, axis=0) +class TestEmptyDataFrameReductions: + @pytest.mark.parametrize( + "opname, dtype, exp_value, exp_dtype", + [ + ("sum", np.int8, 0, np.int64), + ("prod", np.int8, 1, np.int_), + ("sum", np.int64, 0, np.int64), + ("prod", np.int64, 1, np.int64), + ("sum", np.uint8, 0, np.int64), + ("prod", np.uint8, 1, np.uint), + ("sum", np.uint64, 0, np.int64), + ("prod", np.uint64, 1, np.uint64), + ("sum", np.float32, 0, np.float32), + ("prod", np.float32, 1, np.float32), + ("sum", np.float64, 0, np.float64), + ], + ) + def test_df_empty_min_count_0(self, opname, dtype, exp_value, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=0) + + expected = Series([exp_value, exp_value], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "opname, dtype, exp_dtype", + [ + ("sum", np.int8, np.float64), + ("prod", np.int8, np.float64), + ("sum", np.int64, np.float64), + ("prod", np.int64, np.float64), + ("sum", np.uint8, np.float64), + ("prod", np.uint8, np.float64), + ("sum", np.uint64, np.float64), + ("prod", np.uint64, np.float64), + ("sum", np.float32, np.float32), + ("prod", np.float32, np.float32), + ("sum", np.float64, np.float64), + ], + ) + def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=1) + + expected = Series([np.nan, np.nan], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "opname, dtype, exp_value, exp_dtype", + [ + ("sum", "Int8", 0, ("Int32" if is_windows_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), + ("sum", "Int64", 0, "Int64"), + ("prod", "Int64", 1, "Int64"), + ("sum", "UInt8", 0, ("UInt32" if is_windows_or_is32 else "UInt64")), + ("prod", "UInt8", 1, ("UInt32" if is_windows_or_is32 else "UInt64")), + ("sum", "UInt64", 0, "UInt64"), + ("prod", "UInt64", 1, "UInt64"), + ("sum", "Float32", 0, "Float32"), + ("prod", "Float32", 1, "Float32"), + ("sum", "Float64", 0, "Float64"), + ], + ) + def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = 
getattr(df, opname)(min_count=0) + + expected = Series([exp_value, exp_value], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "opname, dtype, exp_dtype", + [ + ("sum", "Int8", "Int64"), + ("prod", "Int8", "Int64"), + ("sum", "Int64", "Int64"), + ("prod", "Int64", "Int64"), + ("sum", "UInt8", "UInt64"), + ("prod", "UInt8", "UInt64"), + ("sum", "UInt64", "UInt64"), + ("prod", "UInt64", "UInt64"), + ("sum", "Float32", "Float32"), + ("prod", "Float32", "Float32"), + ("sum", "Float64", "Float64"), + ], + ) + def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=1) + + expected = Series([pd.NA, pd.NA], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + def test_sum_timedelta64_skipna_false(using_array_manager, request): # GH#17235 if using_array_manager: From 6397977fbeaf2571bfcdc9bee2ffda9f03181cbb Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 19 Apr 2023 18:39:41 +0100 Subject: [PATCH 02/65] precommit issues --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7f580e5aa6851..f32fc84b295da 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -45,9 +45,9 @@ is_integer_dtype, is_list_like, is_scalar, + is_signed_integer_dtype, is_string_dtype, is_unsigned_integer_dtype, - is_signed_integer_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import BaseMaskedDtype From 0e797b99a447b011bca99772a381671b349ed494 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 19 Apr 2023 22:45:32 +0100 Subject: [PATCH 03/65] fix failures --- pandas/core/arrays/masked.py | 11 ++++------- pandas/core/arrays/sparse/array.py | 4 +++- pandas/tests/extension/base/dim2.py | 2 ++ pandas/tests/groupby/test_apply.py | 3 +-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index f32fc84b295da..128dcef4daec4 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -41,13 +41,10 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( is_bool, - is_float_dtype, is_integer_dtype, is_list_like, is_scalar, - is_signed_integer_dtype, is_string_dtype, - is_unsigned_integer_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import BaseMaskedDtype @@ -1122,19 +1119,19 @@ def _wrap_min_count_reduction_result( self, name: str, result, skipna, min_count, **kwargs ): if min_count == 0 and isinstance(result, np.ndarray): - return self._maybe_mask_result(result, np.zeros(1, dtype=bool)) + return self._maybe_mask_result(result, np.zeros(result.shape, dtype=bool)) return self._wrap_reduction_result(name, result, skipna, **kwargs) def _wrap_na_result(self, name): mask = np.ones(1, dtype=bool) - if is_float_dtype(self.dtype): + if self.dtype.kind == "f": np_dtype = np.float64 elif name in ["mean", "median", "var", "std", "skew"]: np_dtype = np.float64 - elif is_signed_integer_dtype(self.dtype): + elif self.dtype.kind == "i": np_dtype = np.int64 - elif is_unsigned_integer_dtype(self.dtype): + elif self.dtype.kind == "u": np_dtype = np.uint64 else: raise TypeError(self.dtype) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 4f5505015ef76..1403b733cd0b1 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1373,7 
+1373,9 @@ def nonzero(self) -> tuple[npt.NDArray[np.int32]]: # Reductions # ------------------------------------------------------------------------ - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): method = getattr(self, name, None) if method is None: diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 85f01b1ee5d5e..38d454cfdbfc1 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -197,6 +197,8 @@ def test_reductions_2d_axis0(self, data, method): if method in ["std", "var"]: # pass ddof=0 so we get all-zero std instead of all-NA std kwargs["ddof"] = 0 + elif method in ["prod", "sum"]: + kwargs["min_count"] = 1 try: result = getattr(arr2d, method)(axis=0, **kwargs) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 832192d8a33e6..a9912d75c8978 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -890,8 +890,7 @@ def test_apply_multi_level_name(category): if category: b = pd.Categorical(b, categories=[1, 2, 3]) expected_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3], name="B") - # GH#40669 - summing an empty frame gives float dtype - expected_values = [20.0, 25.0, 0.0] + expected_values = [20, 25, 0] else: expected_index = Index([1, 2], name="B") expected_values = [20, 25] From b846e70f9d296c6732f62c10897fd00fa670aff7 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 19 Apr 2023 23:24:49 +0100 Subject: [PATCH 04/65] fix failures --- pandas/tests/frame/test_reductions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 7e2caec53eb3c..9f90367aff87a 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1747,12 +1747,12 @@ def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype @pytest.mark.parametrize( "opname, dtype, exp_dtype", [ - ("sum", "Int8", "Int64"), - ("prod", "Int8", "Int64"), + ("sum", "Int8", ("Int32" if is_windows_or_is32 else "Int64")), + ("prod", "Int8", ("Int32" if is_windows_or_is32 else "Int64")), ("sum", "Int64", "Int64"), ("prod", "Int64", "Int64"), - ("sum", "UInt8", "UInt64"), - ("prod", "UInt8", "UInt64"), + ("sum", "UInt8", ("UInt32" if is_windows_or_is32 else "UInt64")), + ("prod", "UInt8", ("UInt32" if is_windows_or_is32 else "UInt64")), ("sum", "UInt64", "UInt64"), ("prod", "UInt64", "UInt64"), ("sum", "Float32", "Float32"), From 76ce5941c377acdc879ab24b90920a9ae7c84c6c Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 20 Apr 2023 06:52:37 +0100 Subject: [PATCH 05/65] mypy + some docs --- pandas/core/arrays/base.py | 6 +++++- pandas/core/arrays/masked.py | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8b6c6ece6fe0e..5bc71385e7389 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1416,13 +1416,17 @@ def _reduce( std, var, sem, kurt, skew }. skipna : bool, default True If True, skip NaN values. + keepdims: bool, default False + If True, wraps the reduced value in a ndarray/ExtensionArray before + returning it. + If False, returns the reduced value as-is. **kwargs Additional keyword arguments passed to the reduction function. Currently, `ddof` is the only supported kwarg. 
Returns ------- - scalar + scalar if keepdims is False else an ndarray/ExtensionArray Raises ------ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 128dcef4daec4..807eab606a4de 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1126,13 +1126,13 @@ def _wrap_na_result(self, name): mask = np.ones(1, dtype=bool) if self.dtype.kind == "f": - np_dtype = np.float64 + np_dtype = "float64" elif name in ["mean", "median", "var", "std", "skew"]: - np_dtype = np.float64 + np_dtype = "float64" elif self.dtype.kind == "i": - np_dtype = np.int64 + np_dtype = "int64" elif self.dtype.kind == "u": - np_dtype = np.uint64 + np_dtype = "uint64" else: raise TypeError(self.dtype) From 7644598a85b03ad12113ece370430ce2d90bd33b Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 20 Apr 2023 08:25:09 +0100 Subject: [PATCH 06/65] doc linting linting --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5bc71385e7389..f9c944b4fa3c1 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1416,7 +1416,7 @@ def _reduce( std, var, sem, kurt, skew }. skipna : bool, default True If True, skip NaN values. - keepdims: bool, default False + keepdims : bool, default False If True, wraps the reduced value in a ndarray/ExtensionArray before returning it. If False, returns the reduced value as-is. From 51da9eff083ebbf7373a8678fc53adb20378d449 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 20 Apr 2023 11:50:08 +0100 Subject: [PATCH 07/65] refactor to use _reduce_with_wrap --- pandas/core/arrays/arrow/array.py | 7 +++++++ pandas/core/arrays/base.py | 31 +++++++++++++++--------------- pandas/core/arrays/masked.py | 18 ++++++++--------- pandas/core/arrays/sparse/array.py | 4 +--- pandas/core/frame.py | 2 +- 5 files changed, 33 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5a9e4a97eccea..ac49440ec5b52 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1533,6 +1533,13 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return result.as_py() + def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): + """Takes the result of `_reduce` and wraps it an a ndarray/extensionArray.""" + # TODO: is there a way to do this without .as_py() + result = self._reduce(name, skipna=skipna, **kwargs) + result = pa.array([result.as_py()], type=result.type) + return type(self)(result) + def __setitem__(self, key, value) -> None: """Set one or more values inplace. diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f9c944b4fa3c1..99ddb45ca4dfb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -136,6 +136,7 @@ class ExtensionArray: _from_sequence_of_strings _hash_pandas_object _reduce + _reduce_with_wrap _values_for_argsort _values_for_factorize @@ -1402,9 +1403,7 @@ def _accumulate( """ raise NotImplementedError(f"cannot perform {name} with type {self.dtype}") - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): """ Return a scalar result of performing the reduction operation. @@ -1416,21 +1415,21 @@ def _reduce( std, var, sem, kurt, skew }. skipna : bool, default True If True, skip NaN values. 
- keepdims : bool, default False - If True, wraps the reduced value in a ndarray/ExtensionArray before - returning it. - If False, returns the reduced value as-is. **kwargs Additional keyword arguments passed to the reduction function. Currently, `ddof` is the only supported kwarg. Returns ------- - scalar if keepdims is False else an ndarray/ExtensionArray + scalar Raises ------ TypeError : subclass does not define reductions + + See also + -------- + ExtensionArray._reduce_with_wrap """ meth = getattr(self, name, None) if meth is None: @@ -1438,14 +1437,16 @@ def _reduce( f"'{type(self).__name__}' with dtype {self.dtype} " f"does not support reduction '{name}'" ) - result = meth(skipna=skipna, **kwargs) + return meth(skipna=skipna, **kwargs) - if keepdims: - # if subclasses want to avoid wrapping in np.array, do: - # super()._reduce(..., keepdims=False) and wrap that. - return np.array([[result]]) - else: - return result + def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): + """Calls `_reduce` and wraps the result in a ndarray/extensionArray. + + This is used to control the returned dtype when doing reductions in DataFrames, + and ensures the correct dtype for e.g. ``DataFrame({"a": extr_arr2}).sum()``. + """ + result = self._reduce(name, skipna=skipna, **kwargs) + return np.array([[result]]) # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 # Incompatible types in assignment (expression has type "None", base class diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 807eab606a4de..5b4cbc7060ec5 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1079,15 +1079,7 @@ def _quantile( # ------------------------------------------------------------------ # Reductions - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - if keepdims: - res = self.reshape(-1, 1)._reduce(name=name, skipna=skipna, **kwargs) - if res is libmissing.NA: - res = self._wrap_na_result(name) - return res - + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}: return getattr(self, name)(skipna=skipna, **kwargs) @@ -1103,6 +1095,12 @@ def _reduce( return result + def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): + res = self.reshape(-1, 1)._reduce(name=name, skipna=skipna, **kwargs) + if res is libmissing.NA: + res = self._wrap_na_result(name) + return res + def _wrap_reduction_result(self, name: str, result, skipna, **kwargs): if isinstance(result, np.ndarray): axis = kwargs["axis"] @@ -1177,7 +1175,7 @@ def prod( axis=axis, ) return self._wrap_min_count_reduction_result( - "sum", result, skipna=skipna, min_count=min_count, axis=axis, **kwargs + "prod", result, skipna=skipna, min_count=min_count, axis=axis, **kwargs ) def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 1403b733cd0b1..4f5505015ef76 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1373,9 +1373,7 @@ def nonzero(self) -> tuple[npt.NDArray[np.int32]]: # Reductions # ------------------------------------------------------------------------ - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): method = getattr(self, name, None) if method is 
None: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b88616175c4ab..4425a7f08ba0f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10848,7 +10848,7 @@ def blk_func(values, axis: Axis = 1): self._mgr, ArrayManager ): return values._reduce(name, axis=1, skipna=skipna, **kwds) - return values._reduce(name, skipna=skipna, keepdims=True, **kwds) + return values._reduce_with_wrap(name, skipna=skipna, kwargs=kwds) else: return op(values, axis=axis, skipna=skipna, **kwds) From 8d925cdbf1898057c4553117af6f23144045af59 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 20 Apr 2023 12:39:53 +0100 Subject: [PATCH 08/65] docstring linting --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 99ddb45ca4dfb..648196e89ed31 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1427,7 +1427,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): ------ TypeError : subclass does not define reductions - See also + See Also -------- ExtensionArray._reduce_with_wrap """ From d7d19895c66c91c5f9c60952eeff359172a930ba Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 20 Apr 2023 13:55:44 +0100 Subject: [PATCH 09/65] pyarrow failure + linting --- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/base.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index ac49440ec5b52..c236ab25a0619 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1537,7 +1537,7 @@ def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): """Takes the result of `_reduce` and wraps it an a ndarray/extensionArray.""" # TODO: is there a way to do this without .as_py() result = self._reduce(name, skipna=skipna, **kwargs) - result = pa.array([result.as_py()], type=result.type) + result = pa.array([result]) return type(self)(result) def __setitem__(self, key, value) -> None: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 648196e89ed31..ed77162a4146d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1430,6 +1430,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): See Also -------- ExtensionArray._reduce_with_wrap + Calls ``_reduce`` and wraps the result in a ndarray/extensionArray. """ meth = getattr(self, name, None) if meth is None: @@ -1440,7 +1441,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return meth(skipna=skipna, **kwargs) def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): - """Calls `_reduce` and wraps the result in a ndarray/extensionArray. + """Calls ``_reduce`` and wraps the result in a ndarray/extensionArray. This is used to control the returned dtype when doing reductions in DataFrames, and ensures the correct dtype for e.g. ``DataFrame({"a": extr_arr2}).sum()``. 
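The `_reduce_with_wrap` hook wired up in the patches above is what lets an extension array hand a length-1 array back to the block machinery instead of a bare scalar, so the nullable dtype survives a DataFrame reduction. A minimal sketch of the intended behaviour (an illustrative sketch rather than part of any patch, assuming a pandas build with these changes applied):

    import pandas as pd

    arr = pd.array([1, 2, pd.NA], dtype="Int64")

    # _reduce returns a plain scalar and loses the masked-dtype information.
    arr._reduce("sum", skipna=True)            # 3

    # _reduce_with_wrap keeps the result inside a length-1 IntegerArray,
    # so the Int64 dtype is carried through the reduction.
    arr._reduce_with_wrap("sum", kwargs={})    # <IntegerArray> [3], dtype: Int64

    # At the DataFrame level this is what makes the reduced Series come back
    # as Int64 instead of being cast to int64 or object:
    pd.DataFrame({"a": arr}).sum()             # a    3
                                               # dtype: Int64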
From 54bcb609ccd59050fd4a634a85761fa901acd29b Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 20 Apr 2023 13:56:02 +0100 Subject: [PATCH 10/65] pyarrow failure + linting --- pandas/core/arrays/arrow/array.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c236ab25a0619..0636959773a7c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1535,7 +1535,6 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): """Takes the result of `_reduce` and wraps it an a ndarray/extensionArray.""" - # TODO: is there a way to do this without .as_py() result = self._reduce(name, skipna=skipna, **kwargs) result = pa.array([result]) return type(self)(result) From 03b8ce4278f37ce3f8b411dc43c4d5fb78ce52c1 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 20 Apr 2023 13:57:18 +0100 Subject: [PATCH 11/65] linting --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0636959773a7c..03db657653d3a 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1534,7 +1534,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return result.as_py() def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): - """Takes the result of `_reduce` and wraps it an a ndarray/extensionArray.""" + """Takes the result of ``_reduce`` and wraps it an a ndarray/extensionArray.""" result = self._reduce(name, skipna=skipna, **kwargs) result = pa.array([result]) return type(self)(result) From e0af36f78b0bed60b49d451872b09004c613d2fe Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 20 Apr 2023 15:11:55 +0100 Subject: [PATCH 12/65] doc stuff --- doc/source/reference/extensions.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index b33efd388bd60..7909e0d4a4705 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -40,6 +40,7 @@ objects. api.extensions.ExtensionArray._from_sequence_of_strings api.extensions.ExtensionArray._hash_pandas_object api.extensions.ExtensionArray._reduce + api.extensions.ExtensionArray._reduce_with_wrap api.extensions.ExtensionArray._values_for_argsort api.extensions.ExtensionArray._values_for_factorize api.extensions.ExtensionArray.argsort From 64d8d605f142d68a50c0f3ed0fc2468a7d6995a8 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 21 Apr 2023 23:56:11 +0100 Subject: [PATCH 13/65] linting fixes --- pandas/core/arrays/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ed77162a4146d..656dc3b526ea9 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1441,10 +1441,14 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return meth(skipna=skipna, **kwargs) def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): - """Calls ``_reduce`` and wraps the result in a ndarray/extensionArray. + """Call ``_reduce`` and wrap the result in a ndarray/extensionArray. This is used to control the returned dtype when doing reductions in DataFrames, and ensures the correct dtype for e.g. ``DataFrame({"a": extr_arr2}).sum()``. 
+ + Returns + ------- + ndarray orx ExtensionArray """ result = self._reduce(name, skipna=skipna, **kwargs) return np.array([[result]]) From a95e5b9907feae2051689c6430a36a1e04c71838 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 22 Apr 2023 06:41:29 +0100 Subject: [PATCH 14/65] fix fix doc string --- pandas/core/arrays/base.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 656dc3b526ea9..2e5cb45f18e55 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1441,14 +1441,24 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return meth(skipna=skipna, **kwargs) def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): - """Call ``_reduce`` and wrap the result in a ndarray/extensionArray. + """ + Call ``_reduce`` and wrap the result in a ndarray/ExtensionArray. This is used to control the returned dtype when doing reductions in DataFrames, and ensures the correct dtype for e.g. ``DataFrame({"a": extr_arr2}).sum()``. Returns ------- - ndarray orx ExtensionArray + ndarray or ExtensionArray + + Examples + -------- + >>> import pandas as pd + >>> arr = pd.array([1, 2, pd.NA]) + >>> arr._reduce_with_wrap("sum", kwargs={}) + + [3] + Length: 1, dtype: Int64 """ result = self._reduce(name, skipna=skipna, **kwargs) return np.array([[result]]) From e7a75e47b7e33a403ae89a55c57105444b9b3109 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 22 Apr 2023 07:03:36 +0100 Subject: [PATCH 15/65] remove _wrap_na_result --- pandas/core/arrays/masked.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 5b4cbc7060ec5..9e35f71c22ca2 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1097,8 +1097,6 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): res = self.reshape(-1, 1)._reduce(name=name, skipna=skipna, **kwargs) - if res is libmissing.NA: - res = self._wrap_na_result(name) return res def _wrap_reduction_result(self, name: str, result, skipna, **kwargs): @@ -1120,23 +1118,6 @@ def _wrap_min_count_reduction_result( return self._maybe_mask_result(result, np.zeros(result.shape, dtype=bool)) return self._wrap_reduction_result(name, result, skipna, **kwargs) - def _wrap_na_result(self, name): - mask = np.ones(1, dtype=bool) - - if self.dtype.kind == "f": - np_dtype = "float64" - elif name in ["mean", "median", "var", "std", "skew"]: - np_dtype = "float64" - elif self.dtype.kind == "i": - np_dtype = "int64" - elif self.dtype.kind == "u": - np_dtype = "uint64" - else: - raise TypeError(self.dtype) - - value = np.array([1], dtype=np_dtype) - return self._maybe_mask_result(value, mask=mask) - def sum( self, *, From 2e641918cc305d953960c58d5a0468e3a744aad0 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 23 Apr 2023 15:04:23 +0100 Subject: [PATCH 16/65] doc string example --- pandas/core/arrays/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2e5cb45f18e55..fd619119c4360 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1453,7 +1453,6 @@ def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): Examples -------- - >>> import pandas as pd >>> arr = pd.array([1, 2, pd.NA]) >>> arr._reduce_with_wrap("sum", kwargs={}) From 
b6c1dc87bde7f6c3aec07999df5c335413fb3c7d Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 24 Apr 2023 20:45:22 +0100 Subject: [PATCH 17/65] pyarrow + categorical --- pandas/core/array_algos/masked_reductions.py | 7 ++-- pandas/core/arrays/arrow/array.py | 4 +- pandas/core/arrays/categorical.py | 4 ++ pandas/core/arrays/masked.py | 42 +++++++++++++++---- pandas/core/frame.py | 4 +- .../arrays/categorical/test_analytics.py | 12 ++++++ pandas/tests/arrays/integer/test_reduction.py | 1 - pandas/tests/extension/base/reduce.py | 10 +++++ pandas/tests/extension/masked_shared.py | 21 ++++++++++ pandas/tests/extension/test_arrow.py | 32 ++++++++++++++ pandas/tests/frame/test_reductions.py | 4 +- 11 files changed, 123 insertions(+), 18 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 60a8d349984b9..5c3474222c128 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -119,14 +119,13 @@ def _minmax( # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA else: - return func(values) + return func(values, axis=axis) else: subset = values[~mask] - if subset.size: - return func(subset) - else: + if not subset.size: # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA + return func(values, where=~mask, axis=axis, initial=subset[0]) def min( diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 03db657653d3a..d11904b4c0031 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1535,8 +1535,8 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): """Takes the result of ``_reduce`` and wraps it an a ndarray/extensionArray.""" - result = self._reduce(name, skipna=skipna, **kwargs) - result = pa.array([result]) + result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) + result = pa.array([result.as_py()], type=result.type) return type(self)(result) def __setitem__(self, key, value) -> None: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6eb21fae29612..fef82e1bb96a2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2099,6 +2099,10 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: # ------------------------------------------------------------------ # Reductions + def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): + result = self._reduce(name, skipna=skipna, **kwargs) + return type(self)([result], dtype=self.dtype) + def min(self, *, skipna: bool = True, **kwargs): """ The minimum value of the object. 
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9e35f71c22ca2..aa220aac4960a 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -55,6 +55,7 @@ notna, ) +import pandas as pd from pandas.core import ( algorithms as algos, arraylike, @@ -1088,20 +1089,22 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): # median, skew, kurt, sem op = getattr(nanops, f"nan{name}") - result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) - + axis = kwargs.pop("axis", None) + result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs) if np.isnan(result): - return libmissing.NA + result = libmissing.NA - return result + return self._wrap_reduction_result( + name, result, skipna=skipna, axis=axis, **kwargs + ) def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): - res = self.reshape(-1, 1)._reduce(name=name, skipna=skipna, **kwargs) + res = self.reshape(-1, 1)._reduce(name=name, skipna=skipna, axis=0, **kwargs) return res def _wrap_reduction_result(self, name: str, result, skipna, **kwargs): + axis = kwargs["axis"] if isinstance(result, np.ndarray): - axis = kwargs["axis"] if skipna: # we only retain mask for all-NA rows/columns mask = self._mask.all(axis=axis) @@ -1109,8 +1112,25 @@ def _wrap_reduction_result(self, name: str, result, skipna, **kwargs): mask = self._mask.any(axis=axis) return self._maybe_mask_result(result, mask) + elif result is pd.NA and self.ndim == 2: + result = self._wrap_na_result(name=name, axis=axis) + return result return result + def _wrap_na_result(self, *, name, axis): + mask_size = self.shape[1] if axis == 0 else self.shape[0] + mask = np.ones(mask_size, dtype=bool) + + if name in ["mean", "median", "var", "std", "skew"]: + np_dtype = "float64" + elif name in ["min", "max"]: + np_dtype = self.dtype.type + else: + np_dtype = {"i": "int64", "u": "uint64", "f": "float64"}[self.dtype.kind] + + value = np.array([1], dtype=np_dtype) + return self._maybe_mask_result(value, mask=mask) + def _wrap_min_count_reduction_result( self, name: str, result, skipna, min_count, **kwargs ): @@ -1203,21 +1223,27 @@ def std( def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_min((), kwargs) - return masked_reductions.min( + result = masked_reductions.min( self._data, self._mask, skipna=skipna, axis=axis, ) + return self._wrap_reduction_result( + "min", result, skipna=skipna, axis=axis, **kwargs + ) def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_max((), kwargs) - return masked_reductions.max( + result = masked_reductions.max( self._data, self._mask, skipna=skipna, axis=axis, ) + return self._wrap_reduction_result( + "max", result, skipna=skipna, axis=axis, **kwargs + ) def any(self, *, skipna: bool = True, **kwargs): """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4425a7f08ba0f..d73959201f614 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11157,7 +11157,7 @@ def idxmin( # indices will always be np.ndarray since axis is not None and # values is a 2d array for DataFrame # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" - assert isinstance(indices, np.ndarray) # for mypy + assert isinstance(indices, (np.ndarray, ExtensionArray)) # for mypy index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] @@ -11187,7 +11187,7 @@ def idxmax( # indices will always be np.ndarray since axis is not None and # values is a 2d array for DataFrame # error: Item 
"int" of "Union[int, Any]" has no attribute "__iter__" - assert isinstance(indices, np.ndarray) # for mypy + assert isinstance(indices, (np.ndarray, ExtensionArray)) # for mypy index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 057005b30ae20..27310dc9546e6 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -56,6 +56,18 @@ def test_min_max_ordered(self, index_or_series_or_array): assert np.minimum.reduce(obj) == "d" assert np.maximum.reduce(obj) == "a" + def test_min_max_reduce_with_wrap(self): + # GH52788 + cat = Categorical(["a", "b", "c", "d"], ordered=True) + + result_max = cat._reduce_with_wrap("max", kwargs={}) + expected_max = Categorical(["d"], dtype=cat.dtype) + tm.assert_categorical_equal(result_max, expected_max) + + result_min = cat._reduce_with_wrap("min", kwargs={}) + expected_min = Categorical(["a"], dtype=cat.dtype) + tm.assert_categorical_equal(result_min, expected_min) + @pytest.mark.parametrize( "categories,expected", [ diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py index 08fe7ad624ae7..f4e0706c30175 100644 --- a/pandas/tests/arrays/integer/test_reduction.py +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -42,4 +42,3 @@ def test_dataframe_reductions(op): df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")}) result = getattr(df, op)() assert isinstance(result["a"], np.int64) - diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index cf161a7f4b906..c55c10d192e68 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -4,6 +4,7 @@ import pandas as pd import pandas._testing as tm +from pandas.api.types import is_numeric_dtype from pandas.tests.extension.base.base import BaseExtensionTests @@ -66,6 +67,15 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna): warnings.simplefilter("ignore", RuntimeWarning) self.check_reduce(s, op_name, skipna) + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_with_wrap(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + s = pd.Series(data) + if not is_numeric_dtype(s): + pytest.skip("not numeric dtype") + + self.check_reduce_with_wrap(s, op_name, skipna) + class BaseBooleanReduceTests(BaseReduceTests): @pytest.mark.parametrize("skipna", [True, False]) diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 4c6ce20379419..3ad20b8848a0c 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -64,6 +64,27 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = pd.NA tm.assert_almost_equal(result, expected) + def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + if op_name in ["count", "kurt", "sem"]: + pytest.skip(f"{op_name} not an array method") + + arr = ser.array + + if op_name in ["mean", "median", "var", "std", "skew"]: + cmp_dtype = "Float64" + elif op_name in ["max", "min"]: + cmp_dtype = arr.dtype + else: + cmp_dtype = {"i": "Int64", "u": "UInt64", "f": "Float64"}[arr.dtype.kind] + + result = arr._reduce_with_wrap(op_name, skipna=skipna, kwargs={}) + if not skipna and ser.isna().any(): + expected = pd.array([pd.NA], dtype=cmp_dtype) + else: + exp_value = 
getattr(ser.dropna().astype(cmp_dtype), op_name)() + expected = pd.array([exp_value], dtype=cmp_dtype) + tm.assert_extension_array_equal(result, expected) + class Accumulation(base.BaseAccumulateTests): @pytest.mark.parametrize("skipna", [True, False]) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9129e84700a55..d6984eaf9ab88 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -508,6 +508,38 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request): request.node.add_marker(xfail_mark) super().test_reduce_series(data, all_numeric_reductions, skipna) + def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + if op_name in ["count", "kurt", "sem", "skew"]: + pytest.skip(f"{op_name} not an array method") + + arr = ser.array + kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} + + if op_name in ["max", "min"]: + cmp_dtype = arr.dtype + elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": + if op_name not in ["median", "var", "std"]: + cmp_dtype = arr.dtype + else: + cmp_dtype = "float64[pyarrow]" + elif op_name in ["median", "var", "std", "mean", "skew"]: + cmp_dtype = "float64[pyarrow]" + else: + cmp_dtype = { + "i": "int64[pyarrow]", + "u": "uint64[pyarrow]", + "f": "float64[pyarrow]", + }[arr.dtype.kind] + result = arr._reduce_with_wrap(op_name, skipna=skipna, kwargs=kwargs) + + if not skipna and ser.isna().any(): + expected = pd.array([pd.NA], dtype=cmp_dtype) + else: + exp_value = getattr(ser.dropna().astype(cmp_dtype), op_name)(**kwargs) + expected = pd.array([exp_value], dtype=cmp_dtype) + + tm.assert_extension_array_equal(result, expected) + @pytest.mark.parametrize("typ", ["int64", "uint64", "float64"]) def test_median_not_approximate(self, typ): # GH 52679 diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 9f90367aff87a..08645ac1827ae 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1820,7 +1820,9 @@ def test_minmax_extensionarray(method, numeric_only): df = DataFrame({"Int64": ser}) result = getattr(df, method)(numeric_only=numeric_only) expected = Series( - [getattr(int64_info, method)], index=Index(["Int64"], dtype="object") + [getattr(int64_info, method)], + dtype="Int64", + index=Index(["Int64"], dtype="object"), ) tm.assert_series_equal(result, expected) From 32f9a73b58fe184c4368f90bbe12f8d0f4924a00 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 25 Apr 2023 08:45:21 +0100 Subject: [PATCH 18/65] silence bugs --- pandas/core/array_algos/masked_reductions.py | 2 +- pandas/core/arrays/masked.py | 5 +++-- pandas/tests/extension/decimal/test_decimal.py | 4 ++++ pandas/tests/extension/masked_shared.py | 5 +++-- pandas/tests/extension/test_boolean.py | 4 ++++ pandas/tests/extension/test_numpy.py | 4 ++++ 6 files changed, 19 insertions(+), 5 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 5c3474222c128..2862e0fc5008d 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -52,7 +52,7 @@ def _reductions( axis : int, optional, default None """ if not skipna: - if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count): + if mask.any() or check_below_min_count(values.shape, None, min_count): return libmissing.NA else: return func(values, axis=axis, **kwargs) diff --git 
a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index aa220aac4960a..e5dfe85d66fed 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1121,12 +1121,13 @@ def _wrap_na_result(self, *, name, axis): mask_size = self.shape[1] if axis == 0 else self.shape[0] mask = np.ones(mask_size, dtype=bool) + float_dtype = "float32" if self.dtype == "Float32" else "float64" if name in ["mean", "median", "var", "std", "skew"]: - np_dtype = "float64" + np_dtype = float_dtype elif name in ["min", "max"]: np_dtype = self.dtype.type else: - np_dtype = {"i": "int64", "u": "uint64", "f": "float64"}[self.dtype.kind] + np_dtype = {"i": "int64", "u": "uint64", "f": float_dtype}[self.dtype.kind] value = np.array([1], dtype=np_dtype) return self._maybe_mask_result(value, mask=mask) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index afd04817f05c7..a51a4a7dd2888 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -115,6 +115,10 @@ def check_reduce(self, s, op_name, skipna): expected = getattr(np.asarray(s), op_name)() tm.assert_almost_equal(result, expected) + @pytest.mark.skip("tests not written yet") + def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + pass + class TestNumericReduce(Reduce, base.BaseNumericReduceTests): pass diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 3ad20b8848a0c..20dfef43b12f4 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -69,13 +69,14 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): pytest.skip(f"{op_name} not an array method") arr = ser.array + float_dtype = "Float32" if arr.dtype == "Float32" else "Float64" if op_name in ["mean", "median", "var", "std", "skew"]: - cmp_dtype = "Float64" + cmp_dtype = float_dtype elif op_name in ["max", "min"]: cmp_dtype = arr.dtype else: - cmp_dtype = {"i": "Int64", "u": "UInt64", "f": "Float64"}[arr.dtype.kind] + cmp_dtype = {"i": "Int64", "u": "UInt64", "f": float_dtype}[arr.dtype.kind] result = arr._reduce_with_wrap(op_name, skipna=skipna, kwargs={}) if not skipna and ser.isna().any(): diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index c9fa28a507745..fd8062c101212 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -370,6 +370,10 @@ def check_reduce(self, s, op_name, skipna): expected = bool(expected) tm.assert_almost_equal(result, expected) + @pytest.mark.skip("tests not written yet") + def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + pass + class TestBooleanReduce(base.BaseBooleanReduceTests): pass diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 16b05be2e0bb9..a512707a2b032 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -323,6 +323,10 @@ def check_reduce(self, s, op_name, skipna): expected = getattr(s.astype(s.dtype._dtype), op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) + @pytest.mark.skip("tests not written yet") + def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + pass + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series(self, data, all_boolean_reductions, skipna): super().test_reduce_series(data, all_boolean_reductions, skipna) 
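The min_count handling being adjusted above is pinned down by the TestEmptyDataFrameReductions cases added earlier in the series. For a nullable dtype the expected behaviour is roughly the following (an illustrative sketch rather than part of any patch, assuming a build with these changes applied; the narrow Int8/UInt8 cases are platform dependent, as the is_windows_or_is32 switches in those tests show):

    import pandas as pd

    df = pd.DataFrame({0: [], 1: []}, dtype="Int64")

    # min_count=0: an empty sum still produces a value, and the result keeps
    # the nullable Int64 dtype instead of falling back to float64.
    df.sum(min_count=0)     # 0    0
                            # 1    0
                            # dtype: Int64

    # min_count=1: too few valid values, so the result is <NA>, still Int64.
    df.prod(min_count=1)    # 0    <NA>
                            # 1    <NA>
                            # dtype: Int64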
From 8bf7ba8ff1df3ca315bbd8f2f094f76a5643f19a Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 25 Apr 2023 12:24:03 +0100 Subject: [PATCH 19/65] silence errors --- pandas/tests/extension/masked_shared.py | 6 ++++-- pandas/tests/extension/test_arrow.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 20dfef43b12f4..1ec9e5e2e4152 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -69,12 +69,14 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): pytest.skip(f"{op_name} not an array method") arr = ser.array - float_dtype = "Float32" if arr.dtype == "Float32" else "Float64" + + float32_cond = arr.dtype == "Float32" and not is_platform_windows() + float_dtype = "Float32" if float32_cond else "Float64" if op_name in ["mean", "median", "var", "std", "skew"]: cmp_dtype = float_dtype elif op_name in ["max", "min"]: - cmp_dtype = arr.dtype + cmp_dtype = arr.dtype.name else: cmp_dtype = {"i": "Int64", "u": "UInt64", "f": float_dtype}[arr.dtype.kind] diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d6984eaf9ab88..93f57f26462ca 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -516,10 +516,10 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} if op_name in ["max", "min"]: - cmp_dtype = arr.dtype + cmp_dtype = arr.dtype.name elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": if op_name not in ["median", "var", "std"]: - cmp_dtype = arr.dtype + cmp_dtype = arr.dtype.name else: cmp_dtype = "float64[pyarrow]" elif op_name in ["median", "var", "std", "mean", "skew"]: From 35b07c50aaaa432bd534f431d105cb555512e9a0 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 25 Apr 2023 13:37:06 +0100 Subject: [PATCH 20/65] silence errors II --- pandas/core/arrays/masked.py | 4 +++- pandas/tests/extension/masked_shared.py | 2 +- pandas/tests/extension/test_arrow.py | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e5dfe85d66fed..9be2f9edf3f02 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1125,7 +1125,9 @@ def _wrap_na_result(self, *, name, axis): if name in ["mean", "median", "var", "std", "skew"]: np_dtype = float_dtype elif name in ["min", "max"]: - np_dtype = self.dtype.type + # Incompatible types in assignment (expression has type "str", variable has + # type "Union[dtype[Any], ExtensionDtype]") + np_dtype = self.dtype.type # type: ignore[assignment] else: np_dtype = {"i": "int64", "u": "uint64", "f": float_dtype}[self.dtype.kind] diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 1ec9e5e2e4152..78f562ceff5eb 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -70,7 +70,7 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): arr = ser.array - float32_cond = arr.dtype == "Float32" and not is_platform_windows() + float32_cond = arr.dtype == "Float32" and is_platform_windows() or not IS64 float_dtype = "Float32" if float32_cond else "Float64" if op_name in ["mean", "median", "var", "std", "skew"]: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 
93f57f26462ca..0a3b0687e9b59 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -45,6 +45,7 @@ CategoricalDtypeType, ) +from pandas._typing import Dtype import pandas as pd import pandas._testing as tm from pandas.api.types import ( @@ -516,10 +517,10 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} if op_name in ["max", "min"]: - cmp_dtype = arr.dtype.name + cmp_dtype: Dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": if op_name not in ["median", "var", "std"]: - cmp_dtype = arr.dtype.name + cmp_dtype = arr.dtype else: cmp_dtype = "float64[pyarrow]" elif op_name in ["median", "var", "std", "mean", "skew"]: From 6a390d495029f1f4b59f357d8665bf960dda624c Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 25 Apr 2023 13:50:38 +0100 Subject: [PATCH 21/65] fix errors III --- pandas/tests/extension/test_arrow.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0a3b0687e9b59..62d7558b43136 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -45,7 +45,6 @@ CategoricalDtypeType, ) -from pandas._typing import Dtype import pandas as pd import pandas._testing as tm from pandas.api.types import ( @@ -509,7 +508,7 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request): request.node.add_marker(xfail_mark) super().test_reduce_series(data, all_numeric_reductions, skipna) - def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + def check_reduce_with_wrap(self, ser, op_name, skipna): if op_name in ["count", "kurt", "sem", "skew"]: pytest.skip(f"{op_name} not an array method") @@ -517,7 +516,7 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} if op_name in ["max", "min"]: - cmp_dtype: Dtype = arr.dtype + cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": if op_name not in ["median", "var", "std"]: cmp_dtype = arr.dtype From 8dc2acf0f7d15827c8246982de24cf484d5fe2dc Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 25 Apr 2023 14:11:58 +0100 Subject: [PATCH 22/65] various fixups --- pandas/core/array_algos/masked_reductions.py | 5 +++-- pandas/core/arrays/base.py | 2 +- pandas/core/frame.py | 8 +++----- pandas/core/internals/blocks.py | 7 +++---- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 2862e0fc5008d..9ab4cb51cfdd8 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -122,10 +122,11 @@ def _minmax( return func(values, axis=axis) else: subset = values[~mask] - if not subset.size: + if subset.size: + return func(values, where=~mask, axis=axis, initial=subset[0]) # min/max with empty array raise in numpy, pandas returns NA + else: return libmissing.NA - return func(values, where=~mask, axis=axis, initial=subset[0]) def min( diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index fd619119c4360..d3968bf79227e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1430,7 +1430,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): See Also -------- ExtensionArray._reduce_with_wrap - Calls ``_reduce`` and wraps the 
result in a ndarray/extensionArray. + Calls ``_reduce`` and wraps the result in a ndarray/ExtensionArray. """ meth = getattr(self, name, None) if meth is None: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d73959201f614..c988b0b157d67 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11154,10 +11154,9 @@ def idxmin( ) indices = res._values - # indices will always be np.ndarray since axis is not None and + # indices will always be 1d array since axis is not None and # values is a 2d array for DataFrame - # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" - assert isinstance(indices, (np.ndarray, ExtensionArray)) # for mypy + # indices will always be np.ndarray since axis is not N index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] @@ -11184,9 +11183,8 @@ def idxmax( ) indices = res._values - # indices will always be np.ndarray since axis is not None and + # indices will always be 1d array since axis is not None and # values is a 2d array for DataFrame - # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" assert isinstance(indices, (np.ndarray, ExtensionArray)) # for mypy index = data._get_axis(axis) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 63f3ca1886d87..716432c7e78e2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -342,10 +342,9 @@ def reduce(self, func) -> list[Block]: if self.values.ndim == 1: # TODO(EA2D): special case not needed with 2D EAs - if isinstance(result, (np.ndarray, ExtensionArray)): - res_values = result - else: - res_values = np.array([[result]]) + if not isinstance(result, (np.ndarray, ExtensionArray)): + result = np.array([[result]]) + res_values = result else: res_values = result.reshape(-1, 1) From 5a65c7008be8d38936261c002bd7d698cc874f72 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 25 Apr 2023 16:16:17 +0100 Subject: [PATCH 23/65] various fixups --- pandas/tests/extension/masked_shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 78f562ceff5eb..a1ef012b82bd9 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -70,7 +70,7 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): arr = ser.array - float32_cond = arr.dtype == "Float32" and is_platform_windows() or not IS64 + float32_cond = arr.dtype == "Float32" or (is_platform_windows() or not IS64) float_dtype = "Float32" if float32_cond else "Float64" if op_name in ["mean", "median", "var", "std", "skew"]: From 9cb34ec008f789f25f1a4481b70fe87638b11e09 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 26 Apr 2023 06:38:40 +0100 Subject: [PATCH 24/65] delay fixing windows and 32bit failures --- pandas/core/arrays/masked.py | 3 +-- pandas/tests/extension/masked_shared.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9be2f9edf3f02..1f81fd009eb7d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -55,7 +55,6 @@ notna, ) -import pandas as pd from pandas.core import ( algorithms as algos, arraylike, @@ -1112,7 +1111,7 @@ def _wrap_reduction_result(self, name: str, result, skipna, **kwargs): mask = self._mask.any(axis=axis) return self._maybe_mask_result(result, mask) - elif result is pd.NA and self.ndim == 2: + elif result is 
libmissing.NA and self.ndim == 2: result = self._wrap_na_result(name=name, axis=axis) return result return result diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index a1ef012b82bd9..290a4cd777360 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -67,18 +67,19 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): if op_name in ["count", "kurt", "sem"]: pytest.skip(f"{op_name} not an array method") + elif is_platform_windows() or not IS64: + pytest.skip("tests for platform not written yet") arr = ser.array - float32_cond = arr.dtype == "Float32" or (is_platform_windows() or not IS64) - float_dtype = "Float32" if float32_cond else "Float64" - - if op_name in ["mean", "median", "var", "std", "skew"]: - cmp_dtype = float_dtype + if tm.is_float_dtype(arr.dtype): + cmp_dtype = arr.dtype + elif op_name in ["mean", "median", "var", "std", "skew"]: + cmp_dtype = "Float64" elif op_name in ["max", "min"]: - cmp_dtype = arr.dtype.name + cmp_dtype = arr.dtype else: - cmp_dtype = {"i": "Int64", "u": "UInt64", "f": float_dtype}[arr.dtype.kind] + cmp_dtype = {"i": "Int64", "u": "UInt64", "f": "Float64"}[arr.dtype.kind] result = arr._reduce_with_wrap(op_name, skipna=skipna, kwargs={}) if not skipna and ser.isna().any(): From 8521f189fe12ee29db12935f38db9f291ac24193 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 23 Apr 2023 14:51:39 +0100 Subject: [PATCH 25/65] BUG: Adding a columns to a Frame with RangeIndex columns using a non-scalar key (#52877) --- doc/source/whatsnew/v2.0.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 2613d12e43400..3285d54f8490d 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sort_values` not resetting index when :class:`DataFrame` is already sorted and ``ignore_index=True`` (:issue:`52553`) - Fixed regression in :meth:`MultiIndex.isin` raising ``TypeError`` for ``Generator`` (:issue:`52568`) - Fixed regression in :meth:`Series.describe` showing ``RuntimeWarning`` for extension dtype :class:`Series` with one element (:issue:`52515`) +- Fixed regression in :meth:`SeriesGroupBy.agg` failing when grouping with categorical data, multiple groupings, ``as_index=False``, and a list of aggregations (:issue:`52760`) - Fixed regression when adding a new column to a :class:`DataFrame` when the :attr:`DataFrame.columns` was a :class:`RangeIndex` and the new key was hashable but not a scalar (:issue:`52652`) .. 
--------------------------------------------------------------------------- From 82cd91e07b644c278a3e5c35feedf5511a75b3f4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 24 Apr 2023 00:50:11 +0200 Subject: [PATCH 26/65] DOC: Update whatsnew (#52882) --- doc/source/whatsnew/v2.0.1.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 3285d54f8490d..2613d12e43400 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -20,7 +20,6 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sort_values` not resetting index when :class:`DataFrame` is already sorted and ``ignore_index=True`` (:issue:`52553`) - Fixed regression in :meth:`MultiIndex.isin` raising ``TypeError`` for ``Generator`` (:issue:`52568`) - Fixed regression in :meth:`Series.describe` showing ``RuntimeWarning`` for extension dtype :class:`Series` with one element (:issue:`52515`) -- Fixed regression in :meth:`SeriesGroupBy.agg` failing when grouping with categorical data, multiple groupings, ``as_index=False``, and a list of aggregations (:issue:`52760`) - Fixed regression when adding a new column to a :class:`DataFrame` when the :attr:`DataFrame.columns` was a :class:`RangeIndex` and the new key was hashable but not a scalar (:issue:`52652`) .. --------------------------------------------------------------------------- From e0bc63ec189f6ff97f831cf076a82a09b821bd91 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 26 Apr 2023 16:30:37 +0200 Subject: [PATCH 27/65] CI: Change development python version to 3.10 (#51133) * CI: Change development python version to 3.10 * Update checks * Remove strict * Remove strict * Fixes * Add dt * Switch python to 3.9 * Remove * Fix * Try attribute * Adjust * Fix mypy * Try fixing doc build * Fix mypy * Fix stubtest * Remove workflow file * Rename back * Update * Rename * Rename * Change python version * Remove * Fix doc errors * Remove pypy * Update ci/deps/actions-pypy-39.yaml Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> * Revert pypy removal * Remove again * Fix * Change to 3.9 * Address --------- Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/sdist.yml | 94 +++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 .github/workflows/sdist.yml diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml new file mode 100644 index 0000000000000..957e7103f4ff6 --- /dev/null +++ b/.github/workflows/sdist.yml @@ -0,0 +1,94 @@ +name: sdist + +on: + push: + branches: + - main + - 2.0.x + pull_request: + branches: + - main + - 2.0.x + types: [labeled, opened, synchronize, reopened] + paths-ignore: + - "doc/**" + - "web/**" + +permissions: + contents: read + +jobs: + build: + if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} + runs-on: ubuntu-22.04 + timeout-minutes: 60 + defaults: + run: + shell: bash -el {0} + + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11"] + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{matrix.python-version}}-sdist + cancel-in-progress: true + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python + 
uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + python -m pip install versioneer[toml] + + # GH 39416 + pip install numpy + + - name: Build pandas sdist + run: | + pip list + python setup.py sdist --formats=gztar + + - name: Upload sdist artifact + uses: actions/upload-artifact@v3 + with: + name: ${{matrix.python-version}}-sdist.gz + path: dist/*.gz + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: false + environment-name: pandas-sdist + extra-specs: | + python =${{ matrix.python-version }} + + - name: Install pandas from sdist + run: | + pip list + python -m pip install dist/*.gz + + - name: Force oldest supported NumPy + run: | + case "${{matrix.python-version}}" in + 3.9) + pip install numpy==1.21.6 ;; + 3.10) + pip install numpy==1.21.6 ;; + 3.11) + pip install numpy==1.23.2 ;; + esac + + - name: Import pandas + run: | + cd .. + python -c "import pandas; pandas.show_versions();" From 7cf26ae3a19242a7c67d75d258ebfc6676208836 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 27 Apr 2023 17:24:13 +0100 Subject: [PATCH 28/65] update --- pandas/tests/extension/masked_shared.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 290a4cd777360..7436d555c543e 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -73,11 +73,11 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): arr = ser.array if tm.is_float_dtype(arr.dtype): - cmp_dtype = arr.dtype + cmp_dtype = arr.dtype.name elif op_name in ["mean", "median", "var", "std", "skew"]: cmp_dtype = "Float64" elif op_name in ["max", "min"]: - cmp_dtype = arr.dtype + cmp_dtype = arr.dtype.name else: cmp_dtype = {"i": "Int64", "u": "UInt64", "f": "Float64"}[arr.dtype.kind] From 6330840114a75ed7f49c61b5185dadf41144983f Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 29 Apr 2023 14:22:02 +0100 Subject: [PATCH 29/65] update --- pandas/core/arrays/masked.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1f81fd009eb7d..7b8a3d0e4fd77 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1098,7 +1098,8 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): ) def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): - res = self.reshape(-1, 1)._reduce(name=name, skipna=skipna, axis=0, **kwargs) + df = self.reshape(-1, 1) + res = df._reduce(name=name, skipna=skipna, axis=0, **kwargs) return res def _wrap_reduction_result(self, name: str, result, skipna, **kwargs): From efae9dc8ee200ba76149b682f32eb63b6816c4ce Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 1 May 2023 08:46:03 +0100 Subject: [PATCH 30/65] add docs --- doc/source/user_guide/integer_na.rst | 9 ++++++- doc/source/user_guide/pyarrow.rst | 6 +++++ doc/source/whatsnew/v2.1.0.rst | 39 +++++++++++++++++++++++++--- 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index a3ccb5b0d4019..539670444152b 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -126,13 +126,20 @@ These dtypes can be merged, reshaped & casted. 
pd.concat([df[["A"]], df[["B", "C"]]], axis=1).dtypes df["A"].astype(float) -Reduction and groupby operations such as 'sum' work as well. +Reduction and groupby operations such as :meth:`~DataFrame.sum` work as well. .. ipython:: python + df.sum(numeric_only=True) df.sum() df.groupby("B").A.sum() +.. versionchanged:: 2.1.0 + + When doing reduction operations (:meth:`~DataFrame.sum` etc.) on numeric-only data + frames the integer array dtype will be maintained. Previously, the dtype of reduction + result would have been a numpy numeric dtype. + Scalar NA Value --------------- diff --git a/doc/source/user_guide/pyarrow.rst b/doc/source/user_guide/pyarrow.rst index 61b383afb7c43..5caf080c5ca2e 100644 --- a/doc/source/user_guide/pyarrow.rst +++ b/doc/source/user_guide/pyarrow.rst @@ -152,6 +152,12 @@ The following are just some examples of operations that are accelerated by nativ ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type) ser_dt.dt.strftime("%Y-%m") +.. versionchanged:: 2.1.0 + + When doing :class:`DataFrame` reduction operations (:meth:`~DataFrame.sum` etc.) on + pyarrow data the dtype now will be maintained when possible. Previously, the dtype + of reduction result would have been a numpy numeric dtype. + I/O Reading ----------- diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2c5263f447951..7e82b7294ca42 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -14,10 +14,43 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_210.enhancements.enhancement1: +.. _whatsnew_210.enhancements.reduction_extension_dtypes: -enhancement1 -^^^^^^^^^^^^ +Reductions maintain extension dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, the results of DataFrame reductions +(:meth:`DataFrameG.sum` :meth:`DataFrame.mean` etc.) has numpy dtypes even when the DataFrames +were of extension dtypes. Pandas can now keep the dtypes when doing reductions over Dataframe +columns with a common dtype (:issue:``52788`). + +*New Behavior* + +.. code-block:: ipython + + In [1]: df = pd.DataFrame({"a": [1, 1, 2, 1], "b": [np.nan, 2.0, 3.0, 4.0]}, dtype="Int64") + In [2]: df.sum() + Out[2]: + a 5 + b 9 + dtype: int64 + In [3]: df = df.astype("int64[pyarrow]") + In [4]: df.sum() + Out[4]: + a 5 + b 9 + dtype: int64 + +*New Behavior* + +.. ipython:: python + + df = pd.DataFrame({"a": [1, 1, 2, 1], "b": [np.nan, 2.0, 3.0, 4.0]}, dtype="Int64") + df.sum() + df = df.astype("int64[pyarrow]") + df.sum() + +Notice that the dtype is now a masked dtype and pyarrow dtype, respectively, while previously it was a numpy integer dtype. .. 
_whatsnew_210.enhancements.enhancement2: From b585f3bc2a0922c8a2abcf9131153caec24ee59e Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 1 May 2023 21:19:42 +0100 Subject: [PATCH 31/65] fix windows tests --- .github/workflows/sdist.yml | 94 ------------------------- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/tests/extension/masked_shared.py | 4 +- 3 files changed, 3 insertions(+), 97 deletions(-) delete mode 100644 .github/workflows/sdist.yml diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml deleted file mode 100644 index 957e7103f4ff6..0000000000000 --- a/.github/workflows/sdist.yml +++ /dev/null @@ -1,94 +0,0 @@ -name: sdist - -on: - push: - branches: - - main - - 2.0.x - pull_request: - branches: - - main - - 2.0.x - types: [labeled, opened, synchronize, reopened] - paths-ignore: - - "doc/**" - - "web/**" - -permissions: - contents: read - -jobs: - build: - if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} - runs-on: ubuntu-22.04 - timeout-minutes: 60 - defaults: - run: - shell: bash -el {0} - - strategy: - fail-fast: false - matrix: - python-version: ["3.9", "3.10", "3.11"] - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{matrix.python-version}}-sdist - cancel-in-progress: true - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip setuptools wheel - python -m pip install versioneer[toml] - - # GH 39416 - pip install numpy - - - name: Build pandas sdist - run: | - pip list - python setup.py sdist --formats=gztar - - - name: Upload sdist artifact - uses: actions/upload-artifact@v3 - with: - name: ${{matrix.python-version}}-sdist.gz - path: dist/*.gz - - - name: Set up Conda - uses: ./.github/actions/setup-conda - with: - environment-file: false - environment-name: pandas-sdist - extra-specs: | - python =${{ matrix.python-version }} - - - name: Install pandas from sdist - run: | - pip list - python -m pip install dist/*.gz - - - name: Force oldest supported NumPy - run: | - case "${{matrix.python-version}}" in - 3.9) - pip install numpy==1.21.6 ;; - 3.10) - pip install numpy==1.21.6 ;; - 3.11) - pip install numpy==1.23.2 ;; - esac - - - name: Import pandas - run: | - cd .. - python -c "import pandas; pandas.show_versions();" diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 7e82b7294ca42..036abb0a258ae 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -22,7 +22,7 @@ Reductions maintain extension dtypes In previous versions of pandas, the results of DataFrame reductions (:meth:`DataFrameG.sum` :meth:`DataFrame.mean` etc.) has numpy dtypes even when the DataFrames were of extension dtypes. Pandas can now keep the dtypes when doing reductions over Dataframe -columns with a common dtype (:issue:``52788`). +columns with a common dtype (:issue:`52788`). 
*New Behavior* diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 7436d555c543e..87a556828699e 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -67,8 +67,8 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): if op_name in ["count", "kurt", "sem"]: pytest.skip(f"{op_name} not an array method") - elif is_platform_windows() or not IS64: - pytest.skip("tests for platform not written yet") + elif not (IS64 and not is_platform_windows()): + pytest.skip("tests for platform not written for 32bit Linux") arr = ser.array From 52763abb2214bea1f22fe5ee4af45d7b6103827d Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 1 May 2023 22:08:08 +0100 Subject: [PATCH 32/65] fix windows tests --- pandas/tests/extension/masked_shared.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 87a556828699e..d4ee2bbe2e48a 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -67,7 +67,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): if op_name in ["count", "kurt", "sem"]: pytest.skip(f"{op_name} not an array method") - elif not (IS64 and not is_platform_windows()): + elif not (IS64 or is_platform_windows()): pytest.skip("tests for platform not written for 32bit Linux") arr = ser.array @@ -78,8 +78,16 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): cmp_dtype = "Float64" elif op_name in ["max", "min"]: cmp_dtype = arr.dtype.name + elif arr.dtype == "Int64": + cmp_dtype = "Int64" + elif arr.dtype == "UInt64": + cmp_dtype = "UInt64" + elif tm.is_signed_integer_dtype(arr.dtype): + cmp_dtype = "Int32" if skipna and is_platform_windows() else "Int64" + elif tm.is_unsigned_integer_dtype(arr.dtype): + cmp_dtype = "UInt32" if skipna and is_platform_windows() else "UInt64" else: - cmp_dtype = {"i": "Int64", "u": "UInt64", "f": "Float64"}[arr.dtype.kind] + raise TypeError("not supposed to reach this") result = arr._reduce_with_wrap(op_name, skipna=skipna, kwargs={}) if not skipna and ser.isna().any(): From d4f2a84f4787feb563ac3191fec8e5d5d4ecb790 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 2 May 2023 05:32:47 +0100 Subject: [PATCH 33/65] remove guards for 32bit linux --- pandas/tests/extension/masked_shared.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index d4ee2bbe2e48a..9c01cbb660a77 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -67,8 +67,6 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): if op_name in ["count", "kurt", "sem"]: pytest.skip(f"{op_name} not an array method") - elif not (IS64 or is_platform_windows()): - pytest.skip("tests for platform not written for 32bit Linux") arr = ser.array @@ -78,10 +76,8 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): cmp_dtype = "Float64" elif op_name in ["max", "min"]: cmp_dtype = arr.dtype.name - elif arr.dtype == "Int64": - cmp_dtype = "Int64" - elif arr.dtype == "UInt64": - cmp_dtype = 
"UInt64" + elif arr.dtype == "Int64" or arr.dtype == "UInt64": + cmp_dtype = arr.dtype.name elif tm.is_signed_integer_dtype(arr.dtype): cmp_dtype = "Int32" if skipna and is_platform_windows() else "Int64" elif tm.is_unsigned_integer_dtype(arr.dtype): From 7bfe3feee9b788524fe321db851bb36e2ce0bae1 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 2 May 2023 06:36:26 +0100 Subject: [PATCH 34/65] add bool tests + fix 32-bit failures --- pandas/core/arrays/masked.py | 7 ++++++ pandas/tests/extension/masked_shared.py | 8 ++++--- pandas/tests/extension/test_boolean.py | 29 +++++++++++++++++++++++-- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7b8a3d0e4fd77..1177c93af5289 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1151,6 +1151,9 @@ def sum( ): nv.validate_sum((), kwargs) + if self._data.dtype.kind == "b": + self = self.astype("Int8") + result = masked_reductions.sum( self._data, self._mask, @@ -1171,6 +1174,10 @@ def prod( **kwargs, ): nv.validate_prod((), kwargs) + + if self._data.dtype.kind == "b": + self = self.astype("Int8") + result = masked_reductions.prod( self._data, self._mask, diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 9c01cbb660a77..329478b4f8dd1 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -70,18 +70,20 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): arr = ser.array + is_windows_or_32bit = is_platform_windows() or not IS64 + if tm.is_float_dtype(arr.dtype): cmp_dtype = arr.dtype.name elif op_name in ["mean", "median", "var", "std", "skew"]: cmp_dtype = "Float64" elif op_name in ["max", "min"]: cmp_dtype = arr.dtype.name - elif arr.dtype == "Int64" or arr.dtype == "UInt64": + elif arr.dtype in ["Int64", "UInt64"]: cmp_dtype = arr.dtype.name elif tm.is_signed_integer_dtype(arr.dtype): - cmp_dtype = "Int32" if skipna and is_platform_windows() else "Int64" + cmp_dtype = "Int32" if skipna and is_windows_or_32bit else "Int64" elif tm.is_unsigned_integer_dtype(arr.dtype): - cmp_dtype = "UInt32" if skipna and is_platform_windows() else "UInt64" + cmp_dtype = "UInt32" if skipna and is_windows_or_32bit else "UInt64" else: raise TypeError("not supposed to reach this") diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index fd8062c101212..a8b84df3de1c9 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -26,6 +26,11 @@ from pandas.core.arrays.boolean import BooleanDtype from pandas.tests.extension import base +from pandas.compat import ( + IS64, + is_platform_windows, +) + def make_data(): return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] @@ -370,9 +375,29 @@ def check_reduce(self, s, op_name, skipna): expected = bool(expected) tm.assert_almost_equal(result, expected) - @pytest.mark.skip("tests not written yet") def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): - pass + if op_name in ["count", "kurt", "sem"]: + pytest.skip(f"{op_name} not an array method") + + arr = ser.array + + if op_name in ["mean", "median", "var", "std", "skew"]: + cmp_dtype = "Float64" + elif op_name in ["min", "max"]: + cmp_dtype = "boolean" + elif op_name in ["sum", "prod"]: + is_windows_or_32bit = is_platform_windows() or not IS64 + cmp_dtype = "Int32" if skipna and is_windows_or_32bit else "Int64" + 
else: + raise TypeError("not supposed to reach this") + + result = arr._reduce_with_wrap(op_name, skipna=skipna, kwargs={}) + if not skipna and ser.isna().any(): + expected = pd.array([pd.NA], dtype=cmp_dtype) + else: + exp_value = getattr(ser.dropna().astype(cmp_dtype), op_name)() + expected = pd.array([exp_value], dtype=cmp_dtype) + tm.assert_extension_array_equal(result, expected) class TestBooleanReduce(base.BaseBooleanReduceTests): From f48ea0982bc149f8c21901ad5139a60dfe8e37ea Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 2 May 2023 06:52:50 +0100 Subject: [PATCH 35/65] fix pre-commit failures --- pandas/tests/extension/test_boolean.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index a8b84df3de1c9..cb5ec2f785b1a 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -26,11 +26,6 @@ from pandas.core.arrays.boolean import BooleanDtype from pandas.tests.extension import base -from pandas.compat import ( - IS64, - is_platform_windows, -) - def make_data(): return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] From bbd8cb810e949129abf81e3405ba3c865281c150 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 2 May 2023 09:43:21 +0100 Subject: [PATCH 36/65] fix mypy failures --- pandas/core/arrays/boolean.py | 23 +++++++++++++++++++++++ pandas/core/arrays/masked.py | 6 ------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 43344f04085ae..b4a5a38ffe608 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -28,6 +28,7 @@ import pyarrow from pandas._typing import ( + AxisInt, Dtype, DtypeObj, Self, @@ -404,3 +405,25 @@ def _accumulate( return IntegerArray(data.astype(int), mask)._accumulate( name, skipna=skipna, **kwargs ) + + def sum( + self, + *, + skipna: bool = True, + min_count: int = 0, + axis: AxisInt | None = 0, + **kwargs, + ): + arr = cast(BaseMaskedArray, self.astype("Int8")) + return arr.sum(skipna=skipna, min_count=min_count, axis=axis, **kwargs) + + def prod( + self, + *, + skipna: bool = True, + min_count: int = 0, + axis: AxisInt | None = 0, + **kwargs, + ): + arr = cast(BaseMaskedArray, self.astype("Int8")) + return arr.prod(skipna=skipna, min_count=min_count, axis=axis, **kwargs) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1177c93af5289..a2d0997e0df01 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1151,9 +1151,6 @@ def sum( ): nv.validate_sum((), kwargs) - if self._data.dtype.kind == "b": - self = self.astype("Int8") - result = masked_reductions.sum( self._data, self._mask, @@ -1175,9 +1172,6 @@ def prod( ): nv.validate_prod((), kwargs) - if self._data.dtype.kind == "b": - self = self.astype("Int8") - result = masked_reductions.prod( self._data, self._mask, From c6e9a80194db13b405e67c8a2fdba5f2281504d6 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 2 May 2023 09:51:27 +0100 Subject: [PATCH 37/65] rename _reduce_with -> _reduce_and_wrap --- doc/source/reference/extensions.rst | 2 +- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/base.py | 9 +++++---- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/masked.py | 2 +- pandas/core/frame.py | 2 +- pandas/tests/arrays/categorical/test_analytics.py | 6 +++--- pandas/tests/extension/base/reduce.py | 4 ++-- 
pandas/tests/extension/decimal/test_decimal.py | 2 +- pandas/tests/extension/masked_shared.py | 4 ++-- pandas/tests/extension/test_arrow.py | 4 ++-- pandas/tests/extension/test_boolean.py | 4 ++-- pandas/tests/extension/test_numpy.py | 2 +- 13 files changed, 23 insertions(+), 22 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 7909e0d4a4705..3c9056c956d7d 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -40,7 +40,7 @@ objects. api.extensions.ExtensionArray._from_sequence_of_strings api.extensions.ExtensionArray._hash_pandas_object api.extensions.ExtensionArray._reduce - api.extensions.ExtensionArray._reduce_with_wrap + api.extensions.ExtensionArray._reduce_and_wrap api.extensions.ExtensionArray._values_for_argsort api.extensions.ExtensionArray._values_for_factorize api.extensions.ExtensionArray.argsort diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d11904b4c0031..d5a22b0430925 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1533,7 +1533,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return result.as_py() - def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): + def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): """Takes the result of ``_reduce`` and wraps it an a ndarray/extensionArray.""" result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) result = pa.array([result.as_py()], type=result.type) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d3968bf79227e..08bec301a4588 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -136,7 +136,7 @@ class ExtensionArray: _from_sequence_of_strings _hash_pandas_object _reduce - _reduce_with_wrap + _reduce_and_wrap _values_for_argsort _values_for_factorize @@ -185,6 +185,7 @@ class ExtensionArray: * _accumulate * _reduce + * _reduce_and_wrap One can implement methods to handle parsing from strings that will be used in methods such as ``pandas.io.parsers.read_csv``. @@ -1429,7 +1430,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): See Also -------- - ExtensionArray._reduce_with_wrap + ExtensionArray._reduce_and_wrap Calls ``_reduce`` and wraps the result in a ndarray/ExtensionArray. """ meth = getattr(self, name, None) @@ -1440,7 +1441,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): ) return meth(skipna=skipna, **kwargs) - def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): + def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): """ Call ``_reduce`` and wrap the result in a ndarray/ExtensionArray. 
@@ -1454,7 +1455,7 @@ def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): Examples -------- >>> arr = pd.array([1, 2, pd.NA]) - >>> arr._reduce_with_wrap("sum", kwargs={}) + >>> arr._reduce_and_wrap("sum", kwargs={}) [3] Length: 1, dtype: Int64 diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index fef82e1bb96a2..2f3006e6259d1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2099,7 +2099,7 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: # ------------------------------------------------------------------ # Reductions - def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): + def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): result = self._reduce(name, skipna=skipna, **kwargs) return type(self)([result], dtype=self.dtype) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index a2d0997e0df01..5df261c6c10be 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1097,7 +1097,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): name, result, skipna=skipna, axis=axis, **kwargs ) - def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs): + def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): df = self.reshape(-1, 1) res = df._reduce(name=name, skipna=skipna, axis=0, **kwargs) return res diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c988b0b157d67..9894ca0da4ec1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10848,7 +10848,7 @@ def blk_func(values, axis: Axis = 1): self._mgr, ArrayManager ): return values._reduce(name, axis=1, skipna=skipna, **kwds) - return values._reduce_with_wrap(name, skipna=skipna, kwargs=kwds) + return values._reduce_and_wrap(name, skipna=skipna, kwargs=kwds) else: return op(values, axis=axis, skipna=skipna, **kwds) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 27310dc9546e6..fdc981b0c0d09 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -56,15 +56,15 @@ def test_min_max_ordered(self, index_or_series_or_array): assert np.minimum.reduce(obj) == "d" assert np.maximum.reduce(obj) == "a" - def test_min_max_reduce_with_wrap(self): + def test_min_max_reduce_and_wrap(self): # GH52788 cat = Categorical(["a", "b", "c", "d"], ordered=True) - result_max = cat._reduce_with_wrap("max", kwargs={}) + result_max = cat._reduce_and_wrap("max", kwargs={}) expected_max = Categorical(["d"], dtype=cat.dtype) tm.assert_categorical_equal(result_max, expected_max) - result_min = cat._reduce_with_wrap("min", kwargs={}) + result_min = cat._reduce_and_wrap("min", kwargs={}) expected_min = Categorical(["a"], dtype=cat.dtype) tm.assert_categorical_equal(result_min, expected_min) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index c55c10d192e68..d687eadbe228b 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -68,13 +68,13 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna): self.check_reduce(s, op_name, skipna) @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_with_wrap(self, data, all_numeric_reductions, skipna): + def test_reduce_and_wrap(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions s = pd.Series(data) if not 
is_numeric_dtype(s): pytest.skip("not numeric dtype") - self.check_reduce_with_wrap(s, op_name, skipna) + self.check_reduce_and_wrap(s, op_name, skipna) class BaseBooleanReduceTests(BaseReduceTests): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index a51a4a7dd2888..e69ea6e2954d6 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -116,7 +116,7 @@ def check_reduce(self, s, op_name, skipna): tm.assert_almost_equal(result, expected) @pytest.mark.skip("tests not written yet") - def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): pass diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 329478b4f8dd1..609e26404c7df 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -64,7 +64,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = pd.NA tm.assert_almost_equal(result, expected) - def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): if op_name in ["count", "kurt", "sem"]: pytest.skip(f"{op_name} not an array method") @@ -87,7 +87,7 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): else: raise TypeError("not supposed to reach this") - result = arr._reduce_with_wrap(op_name, skipna=skipna, kwargs={}) + result = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs={}) if not skipna and ser.isna().any(): expected = pd.array([pd.NA], dtype=cmp_dtype) else: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 62d7558b43136..00a598cc81e9b 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -508,7 +508,7 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request): request.node.add_marker(xfail_mark) super().test_reduce_series(data, all_numeric_reductions, skipna) - def check_reduce_with_wrap(self, ser, op_name, skipna): + def check_reduce_and_wrap(self, ser, op_name, skipna): if op_name in ["count", "kurt", "sem", "skew"]: pytest.skip(f"{op_name} not an array method") @@ -530,7 +530,7 @@ def check_reduce_with_wrap(self, ser, op_name, skipna): "u": "uint64[pyarrow]", "f": "float64[pyarrow]", }[arr.dtype.kind] - result = arr._reduce_with_wrap(op_name, skipna=skipna, kwargs=kwargs) + result = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs=kwargs) if not skipna and ser.isna().any(): expected = pd.array([pd.NA], dtype=cmp_dtype) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index cb5ec2f785b1a..93b86c350253e 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -370,7 +370,7 @@ def check_reduce(self, s, op_name, skipna): expected = bool(expected) tm.assert_almost_equal(result, expected) - def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): if op_name in ["count", "kurt", "sem"]: pytest.skip(f"{op_name} not an array method") @@ -386,7 +386,7 @@ def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): else: raise TypeError("not supposed to reach this") - result = arr._reduce_with_wrap(op_name, 
skipna=skipna, kwargs={}) + result = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs={}) if not skipna and ser.isna().any(): expected = pd.array([pd.NA], dtype=cmp_dtype) else: diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index a512707a2b032..0997eda48225e 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -324,7 +324,7 @@ def check_reduce(self, s, op_name, skipna): tm.assert_almost_equal(result, expected) @pytest.mark.skip("tests not written yet") - def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): pass @pytest.mark.parametrize("skipna", [True, False]) From 5200896bae6c219829bae605488cd9c61fb3e096 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 2 May 2023 23:04:09 +0100 Subject: [PATCH 38/65] assert missing attributes --- pandas/tests/extension/test_arrow.py | 4 +++- pandas/tests/extension/test_boolean.py | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 00a598cc81e9b..0259c0ab2b9b7 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -509,10 +509,12 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request): super().test_reduce_series(data, all_numeric_reductions, skipna) def check_reduce_and_wrap(self, ser, op_name, skipna): + arr = ser.array + if op_name in ["count", "kurt", "sem", "skew"]: + assert not hasattr(arr, op_name) pytest.skip(f"{op_name} not an array method") - arr = ser.array kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} if op_name in ["max", "min"]: diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 93b86c350253e..b7422873c7f20 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -371,11 +371,12 @@ def check_reduce(self, s, op_name, skipna): tm.assert_almost_equal(result, expected) def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + arr = ser.array + if op_name in ["count", "kurt", "sem"]: + assert not hasattr(arr, op_name) pytest.skip(f"{op_name} not an array method") - arr = ser.array - if op_name in ["mean", "median", "var", "std", "skew"]: cmp_dtype = "Float64" elif op_name in ["min", "max"]: From 26d4059e40d90989b22d8ab1b5d58511faa35dc7 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 3 May 2023 17:03:39 +0100 Subject: [PATCH 39/65] reduction dtypes on windows and 32bit systems --- pandas/core/arrays/masked.py | 17 ++++++++++++----- pandas/tests/extension/masked_shared.py | 4 ++-- pandas/tests/extension/test_boolean.py | 2 +- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 5df261c6c10be..3ce10a0e0b5f5 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -34,6 +34,10 @@ Shape, npt, ) +from pandas.compat import ( + IS64, + is_platform_windows, +) from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs @@ -1124,12 +1128,15 @@ def _wrap_na_result(self, *, name, axis): float_dtype = "float32" if self.dtype == "Float32" else "float64" if name in ["mean", "median", "var", "std", "skew"]: np_dtype = float_dtype - elif name in ["min", "max"]: - # Incompatible types in assignment 
(expression has type "str", variable has - # type "Union[dtype[Any], ExtensionDtype]") - np_dtype = self.dtype.type # type: ignore[assignment] + elif name in ["min", "max"] or self.dtype.itemsize == 8: + np_dtype = self.dtype.numpy_dtype.name else: - np_dtype = {"i": "int64", "u": "uint64", "f": float_dtype}[self.dtype.kind] + is_windows_or_32bit = is_platform_windows() or not IS64 + int_dtype = "int32" if is_windows_or_32bit else "int64" + uint_dtype = "uint32" if is_windows_or_32bit else "uint64" + np_dtype = {"i": int_dtype, "u": uint_dtype, "f": float_dtype}[ + self.dtype.kind + ] value = np.array([1], dtype=np_dtype) return self._maybe_mask_result(value, mask=mask) diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 609e26404c7df..91177bc7bad35 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -81,9 +81,9 @@ def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): elif arr.dtype in ["Int64", "UInt64"]: cmp_dtype = arr.dtype.name elif tm.is_signed_integer_dtype(arr.dtype): - cmp_dtype = "Int32" if skipna and is_windows_or_32bit else "Int64" + cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" elif tm.is_unsigned_integer_dtype(arr.dtype): - cmp_dtype = "UInt32" if skipna and is_windows_or_32bit else "UInt64" + cmp_dtype = "UInt32" if is_windows_or_32bit else "UInt64" else: raise TypeError("not supposed to reach this") diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index b7422873c7f20..01ff9fde865ea 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -383,7 +383,7 @@ def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): cmp_dtype = "boolean" elif op_name in ["sum", "prod"]: is_windows_or_32bit = is_platform_windows() or not IS64 - cmp_dtype = "Int32" if skipna and is_windows_or_32bit else "Int64" + cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" else: raise TypeError("not supposed to reach this") From b6bd75e657a648226f99571ef130b9f66b0eaec1 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 3 May 2023 14:02:50 +0100 Subject: [PATCH 40/65] add tests for min_count=0 --- pandas/tests/extension/base/dim2.py | 26 ++++++++++++++++---------- pandas/tests/extension/test_arrow.py | 2 +- pandas/tests/extension/test_boolean.py | 2 +- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 38d454cfdbfc1..4cb4fa67b8926 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -190,7 +190,11 @@ def test_reductions_2d_axis_none(self, data, method): assert is_matching_na(result, expected) or result == expected @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) - def test_reductions_2d_axis0(self, data, method): + @pytest.mark.parametrize("min_count", [0, 1]) + def test_reductions_2d_axis0(self, data, method, min_count): + if min_count == 1 and method not in ["sum", "prod"]: + pytest.skip(f"min_count not relevant for {method}") + arr2d = data.reshape(1, -1) kwargs = {} @@ -198,7 +202,7 @@ def test_reductions_2d_axis0(self, data, method): # pass ddof=0 so we get all-zero std instead of all-NA std kwargs["ddof"] = 0 elif method in ["prod", "sum"]: - kwargs["min_count"] = 1 + kwargs["min_count"] = min_count try: result = getattr(arr2d, method)(axis=0, **kwargs) @@ -222,20 +226,22 @@ def 
get_reduction_result_dtype(dtype): # i.e. dtype.kind == "u" return NUMPY_INT_TO_DTYPE[np.dtype(np.uint)] - if method in ["median", "sum", "prod"]: + if method in ["sum", "prod"]: # std and var are not dtype-preserving expected = data - if method in ["sum", "prod"] and data.dtype.kind in "iub": + if data.dtype.kind in "iub": dtype = get_reduction_result_dtype(data.dtype) - expected = data.astype(dtype) - if data.dtype.kind == "b" and method in ["sum", "prod"]: - # We get IntegerArray instead of BooleanArray - pass - else: - assert type(expected) == type(data), type(expected) assert dtype == expected.dtype + if min_count == 0: + fill_value = 1 if method == "prod" else 0 + expected = expected.fillna(fill_value) + + self.assert_extension_array_equal(result, expected) + elif method == "median": + # std and var are not dtype-preserving + expected = data self.assert_extension_array_equal(result, expected) elif method in ["mean", "std", "var"]: if is_integer_dtype(data) or is_bool_dtype(data): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0259c0ab2b9b7..43fc74dd5e1ec 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -513,7 +513,7 @@ def check_reduce_and_wrap(self, ser, op_name, skipna): if op_name in ["count", "kurt", "sem", "skew"]: assert not hasattr(arr, op_name) - pytest.skip(f"{op_name} not an array method") + return kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 01ff9fde865ea..4fdb15243bce0 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -375,7 +375,7 @@ def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): if op_name in ["count", "kurt", "sem"]: assert not hasattr(arr, op_name) - pytest.skip(f"{op_name} not an array method") + return if op_name in ["mean", "median", "var", "std", "skew"]: cmp_dtype = "Float64" From 44dcdce0e6dba2dfbe37bc80ff6429286c12c01c Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 4 May 2023 07:00:47 +0100 Subject: [PATCH 41/65] PERF:median with axis=1 --- pandas/core/nanops.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index b405447239a7a..97bb7bf4bcbf1 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -803,7 +803,11 @@ def get_median(x, _mask=None): warnings.filterwarnings( "ignore", "All-NaN slice encountered", RuntimeWarning ) - res = np.nanmedian(values, axis) + if 1 in values.shape: + # GH52788: nanmedian for 2D arrays can slow, this is a fastpath + res = np.nanmedian(np.squeeze(values), axis=axis, keepdims=True) + else: + res = np.nanmedian(values, axis=axis) else: # must return the correct shape, but median is not defined for the From 3ebcbffab40e5f0367720351f8efe3db83686d5b Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 4 May 2023 08:17:10 +0100 Subject: [PATCH 42/65] median with axis=1 fix --- pandas/core/nanops.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 97bb7bf4bcbf1..20e4034a81ee3 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -803,9 +803,11 @@ def get_median(x, _mask=None): warnings.filterwarnings( "ignore", "All-NaN slice encountered", RuntimeWarning ) - if 1 in values.shape: - # GH52788: nanmedian for 2D arrays can slow, this is a fastpath - res = 
np.nanmedian(np.squeeze(values), axis=axis, keepdims=True) + if (values.shape[1] == 1 and axis == 0) or ( + values.shape[0] == 1 and axis == 1 + ): + # GH52788: fastpath when squeezable, nanmedian for 2D array slow + res = np.nanmedian(np.squeeze(values), keepdims=True) else: res = np.nanmedian(values, axis=axis) From 99d034ea227e2592512d6145dd883f9132713399 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 5 May 2023 23:22:47 +0100 Subject: [PATCH 43/65] streamline Block.reduce --- pandas/core/arrays/masked.py | 10 +++++----- pandas/core/internals/blocks.py | 3 --- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 3ce10a0e0b5f5..872428854b418 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1256,7 +1256,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): "max", result, skipna=skipna, axis=axis, **kwargs ) - def any(self, *, skipna: bool = True, **kwargs): + def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ Return whether any element is truthy. @@ -1275,6 +1275,7 @@ def any(self, *, skipna: bool = True, **kwargs): If `skipna` is False, the result will still be True if there is at least one element that is truthy, otherwise NA will be returned if there are NA's present. + axis : int, optional, default 0 **kwargs : any, default None Additional keywords have no effect but might be accepted for compatibility with NumPy. @@ -1318,7 +1319,6 @@ def any(self, *, skipna: bool = True, **kwargs): >>> pd.array([0, 0, pd.NA]).any(skipna=False) """ - kwargs.pop("axis", None) nv.validate_any((), kwargs) values = self._data.copy() @@ -1337,7 +1337,7 @@ def any(self, *, skipna: bool = True, **kwargs): else: return self.dtype.na_value - def all(self, *, skipna: bool = True, **kwargs): + def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ Return whether all elements are truthy. @@ -1356,6 +1356,7 @@ def all(self, *, skipna: bool = True, **kwargs): If `skipna` is False, the result will still be False if there is at least one element that is falsey, otherwise NA will be returned if there are NA's present. + axis : int, optional, default 0 **kwargs : any, default None Additional keywords have no effect but might be accepted for compatibility with NumPy. 
@@ -1399,7 +1400,6 @@ def all(self, *, skipna: bool = True, **kwargs): >>> pd.array([1, 0, pd.NA]).all(skipna=False) False """ - kwargs.pop("axis", None) nv.validate_all((), kwargs) values = self._data.copy() @@ -1409,7 +1409,7 @@ def all(self, *, skipna: bool = True, **kwargs): # bool, int, float, complex, str, bytes, # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type] - result = values.all() + result = values.all(axis=axis) if skipna: return result diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 716432c7e78e2..9bf9f17172f2a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -341,9 +341,6 @@ def reduce(self, func) -> list[Block]: result = func(self.values) if self.values.ndim == 1: - # TODO(EA2D): special case not needed with 2D EAs - if not isinstance(result, (np.ndarray, ExtensionArray)): - result = np.array([[result]]) res_values = result else: res_values = result.reshape(-1, 1) From 79df9db09e4775834b6bc22902abaf2c1ad2ac2a Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 6 May 2023 15:46:02 +0100 Subject: [PATCH 44/65] fix comments --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/arrays/base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 036abb0a258ae..7db266d60df98 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -20,7 +20,7 @@ Reductions maintain extension dtypes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In previous versions of pandas, the results of DataFrame reductions -(:meth:`DataFrameG.sum` :meth:`DataFrame.mean` etc.) has numpy dtypes even when the DataFrames +(:meth:`DataFrame.sum` :meth:`DataFrame.mean` etc.) has numpy dtypes even when the DataFrames were of extension dtypes. Pandas can now keep the dtypes when doing reductions over Dataframe columns with a common dtype (:issue:`52788`). 
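The ``pandas/core/arrays/base.py`` hunk below changes the fallback wrapping in ``_reduce_and_wrap`` from ``np.array([[result]])`` to ``np.array([result])``. As a rough sketch of what that helper does — illustrative only, with made-up data and a simplified signature, not the actual implementation (the masked-array version instead reshapes to 2-D and calls ``_reduce`` along ``axis=0``, and float-returning reductions such as ``mean`` need a different result dtype than assumed here):

.. code-block:: python

   import pandas as pd

   def reduce_and_wrap(arr, name, skipna=True):
       # Simplified stand-in: run the scalar reduction, then wrap the result
       # in a length-1 array so DataFrame reductions can keep the extension
       # dtype instead of falling back to a numpy dtype.
       result = getattr(arr, name)(skipna=skipna)
       return pd.array([result], dtype=arr.dtype)

   arr = pd.array([1, 2, pd.NA], dtype="Int64")
   reduce_and_wrap(arr, "sum")                 # [3], dtype: Int64
   reduce_and_wrap(arr, "sum", skipna=False)   # [<NA>], dtype: Int64

Returning a length-1 array rather than a bare scalar is what lets ``DataFrame.sum`` and the other reductions keep ``Int64``/pyarrow dtypes, as described in the whatsnew entry above.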
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 08bec301a4588..f572bce02a07a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1461,7 +1461,7 @@ def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): Length: 1, dtype: Int64 """ result = self._reduce(name, skipna=skipna, **kwargs) - return np.array([[result]]) + return np.array([result]) # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 # Incompatible types in assignment (expression has type "None", base class From d01fc1daec2212248b308f257218758ac5b22413 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 May 2023 18:15:32 +0200 Subject: [PATCH 45/65] FIX preserve dtype with datetime columns of different resolution when merging (#53213) --- pandas/tests/reshape/merge/test_merge.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 896f1a9be52be..10bcb31444c67 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -7,6 +7,7 @@ import numpy as np import pytest +import pytz from pandas.core.dtypes.common import is_object_dtype from pandas.core.dtypes.dtypes import CategoricalDtype From bc582f6fa9fd6938efbb2d4e596deab58384dfcc Mon Sep 17 00:00:00 2001 From: Yao Xiao <108576690+Charlie-XIAO@users.noreply.github.com> Date: Tue, 16 May 2023 12:57:50 -0400 Subject: [PATCH 46/65] BUG Merge not behaving correctly when having `MultiIndex` with a single level (#53215) * fix merge when MultiIndex with single level * resolved conversations * fixed code style --- pandas/tests/reshape/merge/test_merge.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 10bcb31444c67..1b4cbe05162b4 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2816,3 +2816,16 @@ def test_merge_multiindex_single_level(): result = df.merge(df2, left_on=["col"], right_index=True, how="left") tm.assert_frame_equal(result, expected) + + +def test_merge_multiindex_single_level(): + # GH #52331 + df = DataFrame({"col": ["A", "B"]}) + df2 = DataFrame( + data={"b": [100]}, + index=MultiIndex.from_tuples([("A",), ("C",)], names=["col"]), + ) + expected = DataFrame({"col": ["A", "B"], "b": [100, np.nan]}) + + result = df.merge(df2, left_on=["col"], right_index=True, how="left") + tm.assert_frame_equal(result, expected) From a7fd1b1803a7a3628a152d7a79c308ce170ce386 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 17 May 2023 18:14:26 +0200 Subject: [PATCH 47/65] BUG: preserve dtype for right/outer merge of datetime with different resolutions (#53233) --- pandas/tests/reshape/merge/test_merge.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 1b4cbe05162b4..f07b17e1b55ea 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -7,7 +7,6 @@ import numpy as np import pytest -import pytz from pandas.core.dtypes.common import is_object_dtype from pandas.core.dtypes.dtypes import CategoricalDtype From 1781d306f36d86ae6ef1bef224ad7f2d2ccba5f3 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 22 May 2023 23:52:36 +0100 Subject: [PATCH 48/65] remove special BooleanArray.sum method --- pandas/core/arrays/boolean.py | 11 ----------- 
pandas/core/arrays/masked.py | 10 +++++----- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index b4a5a38ffe608..e0d5b07ed8f1f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -406,17 +406,6 @@ def _accumulate( name, skipna=skipna, **kwargs ) - def sum( - self, - *, - skipna: bool = True, - min_count: int = 0, - axis: AxisInt | None = 0, - **kwargs, - ): - arr = cast(BaseMaskedArray, self.astype("Int8")) - return arr.sum(skipna=skipna, min_count=min_count, axis=axis, **kwargs) - def prod( self, *, diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 872428854b418..db0be2bf8377f 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1125,16 +1125,16 @@ def _wrap_na_result(self, *, name, axis): mask_size = self.shape[1] if axis == 0 else self.shape[0] mask = np.ones(mask_size, dtype=bool) - float_dtype = "float32" if self.dtype == "Float32" else "float64" + float_dtyp = "float32" if self.dtype == "Float32" else "float64" if name in ["mean", "median", "var", "std", "skew"]: - np_dtype = float_dtype + np_dtype = float_dtyp elif name in ["min", "max"] or self.dtype.itemsize == 8: np_dtype = self.dtype.numpy_dtype.name else: is_windows_or_32bit = is_platform_windows() or not IS64 - int_dtype = "int32" if is_windows_or_32bit else "int64" - uint_dtype = "uint32" if is_windows_or_32bit else "uint64" - np_dtype = {"i": int_dtype, "u": uint_dtype, "f": float_dtype}[ + int_dtyp = "int32" if is_windows_or_32bit else "int64" + uint_dtyp = "uint32" if is_windows_or_32bit else "uint64" + np_dtype = {"b": int_dtyp, "i": int_dtyp, "u": uint_dtyp, "f": float_dtyp}[ self.dtype.kind ] From 68fd3167947ef394b87ba1e8f82e55b877951992 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 23 May 2023 16:46:04 +0100 Subject: [PATCH 49/65] remove BooleanArray.prod --- pandas/core/arrays/boolean.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index e0d5b07ed8f1f..43344f04085ae 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -28,7 +28,6 @@ import pyarrow from pandas._typing import ( - AxisInt, Dtype, DtypeObj, Self, @@ -405,14 +404,3 @@ def _accumulate( return IntegerArray(data.astype(int), mask)._accumulate( name, skipna=skipna, **kwargs ) - - def prod( - self, - *, - skipna: bool = True, - min_count: int = 0, - axis: AxisInt | None = 0, - **kwargs, - ): - arr = cast(BaseMaskedArray, self.astype("Int8")) - return arr.prod(skipna=skipna, min_count=min_count, axis=axis, **kwargs) From 8ceb57d9657dcdc10aa367fe9096ba40ba4ba59f Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 27 May 2023 08:15:21 +0100 Subject: [PATCH 50/65] fixes --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/tests/reshape/merge/test_merge.py | 15 +-------------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 7db266d60df98..16ed64e4a0e23 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -24,7 +24,7 @@ In previous versions of pandas, the results of DataFrame reductions were of extension dtypes. Pandas can now keep the dtypes when doing reductions over Dataframe columns with a common dtype (:issue:`52788`). -*New Behavior* +*Old Behavior* .. 
code-block:: ipython diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f07b17e1b55ea..0490ec4b21d79 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2805,20 +2805,7 @@ def test_merge_datetime_different_resolution(tz, how): def test_merge_multiindex_single_level(): - # GH #52331 - df = DataFrame({"col": ["A", "B"]}) - df2 = DataFrame( - data={"b": [100]}, - index=MultiIndex.from_tuples([("A",), ("C",)], names=["col"]), - ) - expected = DataFrame({"col": ["A", "B"], "b": [100, np.nan]}) - - result = df.merge(df2, left_on=["col"], right_index=True, how="left") - tm.assert_frame_equal(result, expected) - - -def test_merge_multiindex_single_level(): - # GH #52331 + # GH52331 df = DataFrame({"col": ["A", "B"]}) df2 = DataFrame( data={"b": [100]}, From 4375cb2560f1f5c102c3def3166189e7642767da Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 29 May 2023 11:27:03 +0100 Subject: [PATCH 51/65] Update doc/source/whatsnew/v2.1.0.rst Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 16ed64e4a0e23..021e8104745e4 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -16,8 +16,8 @@ Enhancements .. _whatsnew_210.enhancements.reduction_extension_dtypes: -Reductions maintain extension dtypes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +DataFrame reductions preserve extension dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In previous versions of pandas, the results of DataFrame reductions (:meth:`DataFrame.sum` :meth:`DataFrame.mean` etc.) has numpy dtypes even when the DataFrames From f7b354c60b7380c1cfbe483aea074dde1e10414b Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 29 May 2023 20:31:06 +0100 Subject: [PATCH 52/65] Update pandas/core/array_algos/masked_reductions.py Co-authored-by: Joris Van den Bossche --- pandas/core/array_algos/masked_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 9ab4cb51cfdd8..c32d1dd499eea 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -124,8 +124,8 @@ def _minmax( subset = values[~mask] if subset.size: return func(values, where=~mask, axis=axis, initial=subset[0]) - # min/max with empty array raise in numpy, pandas returns NA else: + # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA From f91c6ca15e73260c6af8e4c2bf0bbf4457cfd292 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 30 May 2023 00:40:40 +0100 Subject: [PATCH 53/65] small cleanup --- pandas/core/arrays/masked.py | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index db0be2bf8377f..8269bf169fdae 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1097,17 +1097,14 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): if np.isnan(result): result = libmissing.NA - return self._wrap_reduction_result( - name, result, skipna=skipna, axis=axis, **kwargs - ) + return self._wrap_reduction_result(name, result, skipna=skipna, axis=axis) def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): df = self.reshape(-1, 1) 
res = df._reduce(name=name, skipna=skipna, axis=0, **kwargs) return res - def _wrap_reduction_result(self, name: str, result, skipna, **kwargs): - axis = kwargs["axis"] + def _wrap_reduction_result(self, name: str, result, *, skipna, axis): if isinstance(result, np.ndarray): if skipna: # we only retain mask for all-NA rows/columns @@ -1142,11 +1139,11 @@ def _wrap_na_result(self, *, name, axis): return self._maybe_mask_result(value, mask=mask) def _wrap_min_count_reduction_result( - self, name: str, result, skipna, min_count, **kwargs + self, name: str, result, *, skipna, min_count, axis ): if min_count == 0 and isinstance(result, np.ndarray): return self._maybe_mask_result(result, np.zeros(result.shape, dtype=bool)) - return self._wrap_reduction_result(name, result, skipna, **kwargs) + return self._wrap_reduction_result(name, result, skipna=skipna, axis=axis) def sum( self, @@ -1166,7 +1163,7 @@ def sum( axis=axis, ) return self._wrap_min_count_reduction_result( - "sum", result, skipna=skipna, min_count=min_count, axis=axis, **kwargs + "sum", result, skipna=skipna, min_count=min_count, axis=axis ) def prod( @@ -1187,7 +1184,7 @@ def prod( axis=axis, ) return self._wrap_min_count_reduction_result( - "prod", result, skipna=skipna, min_count=min_count, axis=axis, **kwargs + "prod", result, skipna=skipna, min_count=min_count, axis=axis ) def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): @@ -1198,9 +1195,7 @@ def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): skipna=skipna, axis=axis, ) - return self._wrap_reduction_result( - "mean", result, skipna=skipna, axis=axis, **kwargs - ) + return self._wrap_reduction_result("mean", result, skipna=skipna, axis=axis) def var( self, *, skipna: bool = True, axis: AxisInt | None = 0, ddof: int = 1, **kwargs @@ -1213,9 +1208,7 @@ def var( axis=axis, ddof=ddof, ) - return self._wrap_reduction_result( - "var", result, skipna=skipna, axis=axis, **kwargs - ) + return self._wrap_reduction_result("var", result, skipna=skipna, axis=axis) def std( self, *, skipna: bool = True, axis: AxisInt | None = 0, ddof: int = 1, **kwargs @@ -1228,9 +1221,7 @@ def std( axis=axis, ddof=ddof, ) - return self._wrap_reduction_result( - "std", result, skipna=skipna, axis=axis, **kwargs - ) + return self._wrap_reduction_result("std", result, skipna=skipna, axis=axis) def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_min((), kwargs) @@ -1240,9 +1231,7 @@ def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): skipna=skipna, axis=axis, ) - return self._wrap_reduction_result( - "min", result, skipna=skipna, axis=axis, **kwargs - ) + return self._wrap_reduction_result("min", result, skipna=skipna, axis=axis) def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_max((), kwargs) @@ -1252,9 +1241,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): skipna=skipna, axis=axis, ) - return self._wrap_reduction_result( - "max", result, skipna=skipna, axis=axis, **kwargs - ) + return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ From 9a881fa79e6570d09cba4f2cd0f3be12df667dd6 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 30 May 2023 00:41:28 +0100 Subject: [PATCH 54/65] small cleanup --- pandas/core/arrays/masked.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/masked.py 
b/pandas/core/arrays/masked.py index 8269bf169fdae..be0d642224972 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1104,13 +1104,14 @@ def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): res = df._reduce(name=name, skipna=skipna, axis=0, **kwargs) return res - def _wrap_reduction_result(self, name: str, result, *, skipna, axis): + def _wrap_reduction_result(self, name: str, result, *, mask=None, skipna, axis): if isinstance(result, np.ndarray): + mask = mask if mask is not None else self._mask if skipna: # we only retain mask for all-NA rows/columns - mask = self._mask.all(axis=axis) + mask = mask.all(axis=axis) else: - mask = self._mask.any(axis=axis) + mask = mask.any(axis=axis) return self._maybe_mask_result(result, mask) elif result is libmissing.NA and self.ndim == 2: From f603de048bbdd97b833920b636042a398a888006 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 31 May 2023 22:21:42 +0100 Subject: [PATCH 55/65] only reduce 1d --- pandas/core/arrays/masked.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index be0d642224972..10af3f35444b8 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1095,32 +1095,31 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): axis = kwargs.pop("axis", None) result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs) if np.isnan(result): - result = libmissing.NA + return libmissing.NA - return self._wrap_reduction_result(name, result, skipna=skipna, axis=axis) + return result def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): - df = self.reshape(-1, 1) - res = df._reduce(name=name, skipna=skipna, axis=0, **kwargs) - return res + res = self._reduce(name=name, skipna=skipna, **kwargs) + if res is libmissing.NA: + return self._wrap_na_result(name=name, axis=0, mask_size=(1,)) + else: + res = res.reshape(1) + mask = np.zeros(1, dtype=bool) + return self._maybe_mask_result(res, mask) - def _wrap_reduction_result(self, name: str, result, *, mask=None, skipna, axis): + def _wrap_reduction_result(self, name: str, result, *, skipna, axis): if isinstance(result, np.ndarray): - mask = mask if mask is not None else self._mask if skipna: # we only retain mask for all-NA rows/columns - mask = mask.all(axis=axis) + mask = self._mask.all(axis=axis) else: - mask = mask.any(axis=axis) + mask = self._mask.any(axis=axis) return self._maybe_mask_result(result, mask) - elif result is libmissing.NA and self.ndim == 2: - result = self._wrap_na_result(name=name, axis=axis) - return result return result - def _wrap_na_result(self, *, name, axis): - mask_size = self.shape[1] if axis == 0 else self.shape[0] + def _wrap_na_result(self, *, name, axis, mask_size): mask = np.ones(mask_size, dtype=bool) float_dtyp = "float32" if self.dtype == "Float32" else "float64" From 772998f64a17dd07eebd9751d4e7f8bf2e2c4e91 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 31 May 2023 23:17:48 +0100 Subject: [PATCH 56/65] fix after #53418 --- pandas/tests/frame/test_reductions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 08645ac1827ae..555d8f1b18797 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1681,9 +1681,9 @@ class TestEmptyDataFrameReductions: ("prod", np.int8, 1, np.int_), ("sum", np.int64, 0, 
np.int64), ("prod", np.int64, 1, np.int64), - ("sum", np.uint8, 0, np.int64), + ("sum", np.uint8, 0, np.uint64), ("prod", np.uint8, 1, np.uint), - ("sum", np.uint64, 0, np.int64), + ("sum", np.uint64, 0, np.uint64), ("prod", np.uint64, 1, np.uint64), ("sum", np.float32, 0, np.float32), ("prod", np.float32, 1, np.float32), From 082ddd9c5dbeb4041eaedb9725252eb88add7b7d Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 3 Jun 2023 09:37:14 +0100 Subject: [PATCH 57/65] update according to comments --- doc/source/user_guide/integer_na.rst | 6 ----- pandas/tests/extension/base/reduce.py | 4 ++-- pandas/tests/extension/decimal/array.py | 4 ++++ .../tests/extension/decimal/test_decimal.py | 23 ++++++++++++++++--- pandas/tests/extension/masked_shared.py | 12 +++++++--- pandas/tests/extension/test_arrow.py | 2 +- pandas/tests/extension/test_boolean.py | 2 +- pandas/tests/extension/test_numpy.py | 2 +- 8 files changed, 38 insertions(+), 17 deletions(-) diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 539670444152b..1a727cd78af09 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -134,12 +134,6 @@ Reduction and groupby operations such as :meth:`~DataFrame.sum` work as well. df.sum() df.groupby("B").A.sum() -.. versionchanged:: 2.1.0 - - When doing reduction operations (:meth:`~DataFrame.sum` etc.) on numeric-only data - frames the integer array dtype will be maintained. Previously, the dtype of reduction - result would have been a numpy numeric dtype. - Scalar NA Value --------------- diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index d687eadbe228b..8f3c919cb0957 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -68,13 +68,13 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna): self.check_reduce(s, op_name, skipna) @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_and_wrap(self, data, all_numeric_reductions, skipna): + def test_reduce_frame(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions s = pd.Series(data) if not is_numeric_dtype(s): pytest.skip("not numeric dtype") - self.check_reduce_and_wrap(s, op_name, skipna) + self.check_reduce_frame(s, op_name, skipna) class BaseBooleanReduceTests(BaseReduceTests): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 3e495e9ac6814..147617b4f53b1 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -253,6 +253,10 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): ) from err return op(axis=0) + def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): + result = self._reduce(name, skipna=skipna, **kwargs) + return type(self)([result]) + def _cmp_method(self, other, op): # For use with OpsMixin def convert_values(param): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index e69ea6e2954d6..490326f9b45a9 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -115,9 +115,26 @@ def check_reduce(self, s, op_name, skipna): expected = getattr(np.asarray(s), op_name)() tm.assert_almost_equal(result, expected) - @pytest.mark.skip("tests not written yet") - def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): - pass + def check_reduce_frame(self, ser: 
pd.Series, op_name: str, skipna: bool): + arr = ser.array + df = pd.DataFrame({"a": arr}) + + if op_name in ["count", "kurt", "sem", "skew", "median"]: + assert not hasattr(arr, op_name) + pytest.skip(f"{op_name} not an array method") + + result1 = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs={}) + result2 = getattr(df, op_name)(skipna=skipna).array + + tm.assert_extension_array_equal(result1, result2) + + if not skipna and ser.isna().any(): + expected = DecimalArray([pd.NA]) + else: + exp_value = getattr(ser.dropna(), op_name)() + expected = DecimalArray([exp_value]) + + tm.assert_extension_array_equal(result1, expected) class TestNumericReduce(Reduce, base.BaseNumericReduceTests): diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 91177bc7bad35..c49f83927d2fa 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -64,11 +64,13 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = pd.NA tm.assert_almost_equal(result, expected) - def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): if op_name in ["count", "kurt", "sem"]: + assert not hasattr(ser.array, op_name) pytest.skip(f"{op_name} not an array method") arr = ser.array + df = pd.DataFrame({"a": arr}) is_windows_or_32bit = is_platform_windows() or not IS64 @@ -87,13 +89,17 @@ def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): else: raise TypeError("not supposed to reach this") - result = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs={}) if not skipna and ser.isna().any(): expected = pd.array([pd.NA], dtype=cmp_dtype) else: exp_value = getattr(ser.dropna().astype(cmp_dtype), op_name)() expected = pd.array([exp_value], dtype=cmp_dtype) - tm.assert_extension_array_equal(result, expected) + + result1 = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs={}) + result2 = getattr(df, op_name)(skipna=skipna).array + + tm.assert_extension_array_equal(result1, result2) + tm.assert_extension_array_equal(result2, expected) class Accumulation(base.BaseAccumulateTests): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 03c5dba5f5776..5732b08a35db1 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -509,7 +509,7 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request): request.node.add_marker(xfail_mark) super().test_reduce_series(data, all_numeric_reductions, skipna) - def check_reduce_and_wrap(self, ser, op_name, skipna): + def check_reduce_frame(self, ser, op_name, skipna): arr = ser.array if op_name in ["count", "kurt", "sem", "skew"]: diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 4fdb15243bce0..c4062e8b7902a 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -370,7 +370,7 @@ def check_reduce(self, s, op_name, skipna): expected = bool(expected) tm.assert_almost_equal(result, expected) - def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): arr = ser.array if op_name in ["count", "kurt", "sem"]: diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 0997eda48225e..0392597769930 100644 --- a/pandas/tests/extension/test_numpy.py +++ 
b/pandas/tests/extension/test_numpy.py @@ -324,7 +324,7 @@ def check_reduce(self, s, op_name, skipna): tm.assert_almost_equal(result, expected) @pytest.mark.skip("tests not written yet") - def check_reduce_and_wrap(self, ser: pd.Series, op_name: str, skipna: bool): + def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): pass @pytest.mark.parametrize("skipna", [True, False]) From 803251472da8e4fd76da7d7ae832aed409f2694e Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 3 Jun 2023 12:22:34 +0100 Subject: [PATCH 58/65] revome note --- doc/source/user_guide/pyarrow.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/doc/source/user_guide/pyarrow.rst b/doc/source/user_guide/pyarrow.rst index 5caf080c5ca2e..61b383afb7c43 100644 --- a/doc/source/user_guide/pyarrow.rst +++ b/doc/source/user_guide/pyarrow.rst @@ -152,12 +152,6 @@ The following are just some examples of operations that are accelerated by nativ ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type) ser_dt.dt.strftime("%Y-%m") -.. versionchanged:: 2.1.0 - - When doing :class:`DataFrame` reduction operations (:meth:`~DataFrame.sum` etc.) on - pyarrow data the dtype now will be maintained when possible. Previously, the dtype - of reduction result would have been a numpy numeric dtype. - I/O Reading ----------- From 3a3ec95078d42a947915ca3f26006b67da5d7003 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 5 Jun 2023 17:29:22 +0100 Subject: [PATCH 59/65] update _minmax --- pandas/core/array_algos/masked_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index c32d1dd499eea..335fa1afc0f4e 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -123,7 +123,7 @@ def _minmax( else: subset = values[~mask] if subset.size: - return func(values, where=~mask, axis=axis, initial=subset[0]) + return func(subset, axis=axis) else: # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA From 49334c7a84f8506150dbbe8356ec86eb7f9eb269 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 29 Jun 2023 08:21:21 +0100 Subject: [PATCH 60/65] REF: add keepdims parameter to ExtensionArray._reduce + remove ExtensionArray._reduce_and_wrap --- doc/source/reference/extensions.rst | 1 - doc/source/whatsnew/v2.1.0.rst | 2 + pandas/core/arrays/arrow/array.py | 22 ++++----- pandas/core/arrays/base.py | 46 +++++++------------ pandas/core/arrays/categorical.py | 11 +++-- pandas/core/arrays/masked.py | 41 +++++++++-------- pandas/core/arrays/sparse/array.py | 11 ++++- pandas/core/frame.py | 14 +++++- pandas/tests/extension/decimal/array.py | 41 +++++++++-------- .../tests/extension/decimal/test_decimal.py | 24 +++++++++- pandas/tests/extension/masked_shared.py | 2 +- pandas/tests/extension/test_arrow.py | 2 +- pandas/tests/extension/test_boolean.py | 2 +- 13 files changed, 127 insertions(+), 92 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index d7d98a24a6ba2..63eacc3f6d1d9 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -40,7 +40,6 @@ objects. 
api.extensions.ExtensionArray._from_sequence_of_strings api.extensions.ExtensionArray._hash_pandas_object api.extensions.ExtensionArray._reduce - api.extensions.ExtensionArray._reduce_and_wrap api.extensions.ExtensionArray._values_for_argsort api.extensions.ExtensionArray._values_for_factorize api.extensions.ExtensionArray.argsort diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index bf96078ca7fb7..de8d43234342e 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -52,6 +52,8 @@ columns with a common dtype (:issue:`52788`). Notice that the dtype is now a masked dtype and pyarrow dtype, respectively, while previously it was a numpy integer dtype. +To allow Dataframe reductions to preserve extension dtypes, :ref:`ExtensionArray._reduce` has gotten a new keyword parameter ``keepdims``. Calling :ref:`ExtensionArray._reduce` with ``keepdims=True`` should return an array of length 1 along the reduction axis. In order to maintain backward compatibility, the parameter is not required, but will it become required in the future. If the parameter is not found in the signature, DataFrame reductions can not preserve extension dtypes. Also, if the parameter is not found, a ``FutureWarning`` will be emitted and type checkers like mypy may complain about the signature not being compatible with :ref:`ExtensionArray._reduce`. + .. _whatsnew_210.enhancements.cow: Copy-on-Write improvements diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b22e081c91749..5b27f57a40840 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1512,7 +1512,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs): return result - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): """ Return a scalar result of performing the reduction operation. @@ -1536,18 +1538,16 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): ------ TypeError : subclass does not define reductions """ - result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) - - if pc.is_null(result).as_py(): - return self.dtype.na_value + pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) - return result.as_py() + if keepdims: + result = pa.array([pa_result.as_py()], type=pa_result.type) + return type(self)(result) - def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): - """Takes the result of ``_reduce`` and wraps it an a ndarray/extensionArray.""" - result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) - result = pa.array([result.as_py()], type=result.type) - return type(self)(result) + if pc.is_null(pa_result).as_py(): + return self.dtype.na_value + else: + return pa_result.as_py() def _explode(self): """ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6c1a1c0deb2d9..403474310d911 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -140,7 +140,6 @@ class ExtensionArray: _from_sequence_of_strings _hash_pandas_object _reduce - _reduce_and_wrap _values_for_argsort _values_for_factorize @@ -190,7 +189,6 @@ class ExtensionArray: * _accumulate * _reduce - * _reduce_and_wrap One can implement methods to handle parsing from strings that will be used in methods such as ``pandas.io.parsers.read_csv``. 
@@ -1437,7 +1435,9 @@ def _accumulate( """ raise NotImplementedError(f"cannot perform {name} with type {self.dtype}") - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): """ Return a scalar result of performing the reduction operation. @@ -1449,6 +1449,15 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): std, var, sem, kurt, skew }. skipna : bool, default True If True, skip NaN values. + keepdims : bool, default False + If False, a scalar is returned. + If True, the result has dimension with size one along the reduced axis. + + .. versionadded:: 2.1 + + This parameter is not required in the _reduce signature to keep backward + compatibility, but will become required in the future. If the parameter + is not found in the method signature, a FutureWarning will be emitted. **kwargs Additional keyword arguments passed to the reduction function. Currently, `ddof` is the only supported kwarg. @@ -1460,11 +1469,6 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): Raises ------ TypeError : subclass does not define reductions - - See Also - -------- - ExtensionArray._reduce_and_wrap - Calls ``_reduce`` and wraps the result in a ndarray/ExtensionArray. """ meth = getattr(self, name, None) if meth is None: @@ -1472,29 +1476,11 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): f"'{type(self).__name__}' with dtype {self.dtype} " f"does not support reduction '{name}'" ) - return meth(skipna=skipna, **kwargs) - - def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): - """ - Call ``_reduce`` and wrap the result in a ndarray/ExtensionArray. + result = meth(skipna=skipna, **kwargs) + if keepdims: + result = np.array([result]) - This is used to control the returned dtype when doing reductions in DataFrames, - and ensures the correct dtype for e.g. ``DataFrame({"a": extr_arr2}).sum()``. 
- - Returns - ------- - ndarray or ExtensionArray - - Examples - -------- - >>> arr = pd.array([1, 2, pd.NA]) - >>> arr._reduce_and_wrap("sum", kwargs={}) - - [3] - Length: 1, dtype: Int64 - """ - result = self._reduce(name, skipna=skipna, **kwargs) - return np.array([result]) + return result # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 # Incompatible types in assignment (expression has type "None", base class diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 814f0412fd2e2..b138f0f1cb2ac 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2229,9 +2229,14 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: # ------------------------------------------------------------------ # Reductions - def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): - result = self._reduce(name, skipna=skipna, **kwargs) - return type(self)([result], dtype=self.dtype) + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + if keepdims: + return type(self)(result, dtype=self.dtype) + else: + return result def min(self, *, skipna: bool = True, **kwargs): """ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 10af3f35444b8..a30bc848e392f 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1083,30 +1083,31 @@ def _quantile( # ------------------------------------------------------------------ # Reductions - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}: - return getattr(self, name)(skipna=skipna, **kwargs) - - data = self._data - mask = self._mask + result = getattr(self, name)(skipna=skipna, **kwargs) + else: + # median, skew, kurt, sem + data = self._data + mask = self._mask + op = getattr(nanops, f"nan{name}") + axis = kwargs.pop("axis", None) + result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs) + + if keepdims: + if isna(result): + return self._wrap_na_result(name=name, axis=0, mask_size=(1,)) + else: + result = result.reshape(1) + mask = np.zeros(1, dtype=bool) + return self._maybe_mask_result(result, mask) - # median, skew, kurt, sem - op = getattr(nanops, f"nan{name}") - axis = kwargs.pop("axis", None) - result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs) - if np.isnan(result): + if isna(result): return libmissing.NA - - return result - - def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): - res = self._reduce(name=name, skipna=skipna, **kwargs) - if res is libmissing.NA: - return self._wrap_na_result(name=name, axis=0, mask_size=(1,)) else: - res = res.reshape(1) - mask = np.zeros(1, dtype=bool) - return self._maybe_mask_result(res, mask) + return result def _wrap_reduction_result(self, name: str, result, *, skipna, axis): if isinstance(result, np.ndarray): diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index aba6811c5eeb7..01b8affefda11 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1384,7 +1384,9 @@ def nonzero(self) -> tuple[npt.NDArray[np.int32]]: # Reductions # ------------------------------------------------------------------------ - def _reduce(self, name: str, *, 
skipna: bool = True, **kwargs): + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): method = getattr(self, name, None) if method is None: @@ -1395,7 +1397,12 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): else: arr = self.dropna() - return getattr(arr, name)(**kwargs) + result = getattr(arr, name)(**kwargs) + + if keepdims: + return type(self)([result], dtype=self.dtype) + else: + return result def all(self, axis=None, *args, **kwargs): """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fb47853244fec..6644ae64a62bf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -13,6 +13,7 @@ import collections from collections import abc import functools +from inspect import signature from io import StringIO import itertools import operator @@ -10877,7 +10878,18 @@ def blk_func(values, axis: Axis = 1): self._mgr, ArrayManager ): return values._reduce(name, axis=1, skipna=skipna, **kwds) - return values._reduce_and_wrap(name, skipna=skipna, kwargs=kwds) + sign = signature(values._reduce) + if "keepdims" in sign.parameters: + return values._reduce(name, skipna=skipna, keepdims=True, **kwds) + else: + warnings.warn( + f"{type(values)}._reduce will require a `keepdims` parameter " + "in the future", + FutureWarning, + stacklevel=find_stack_level(), + ) + result = values._reduce(name, skipna=skipna, kwargs=kwds) + return np.array([result]) else: return op(values, axis=axis, skipna=skipna, **kwds) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 53d06f1e1e02a..fc579a50fef78 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -235,28 +235,29 @@ def _formatter(self, boxed=False): def _concat_same_type(cls, to_concat): return cls(np.concatenate([x._data for x in to_concat])) - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): - if skipna: + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if skipna and self.isna().any(): # If we don't have any NAs, we can ignore skipna - if self.isna().any(): - other = self[~self.isna()] - return other._reduce(name, **kwargs) - - if name == "sum" and len(self) == 0: + other = self[~self.isna()] + result = other._reduce(name, **kwargs) + elif name == "sum" and len(self) == 0: # GH#29630 avoid returning int 0 or np.bool_(False) on old numpy - return decimal.Decimal(0) - - try: - op = getattr(self.data, name) - except AttributeError as err: - raise NotImplementedError( - f"decimal does not support the {name} operation" - ) from err - return op(axis=0) - - def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): - result = self._reduce(name, skipna=skipna, **kwargs) - return type(self)([result]) + result = decimal.Decimal(0) + else: + try: + op = getattr(self.data, name) + except AttributeError as err: + raise NotImplementedError( + f"decimal does not support the {name} operation" + ) from err + result = op(axis=0) + + if keepdims: + return type(self)([result]) + else: + return result def _cmp_method(self, other, op): # For use with OpsMixin diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 490326f9b45a9..3f6b1ec8d20dd 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -123,7 +123,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): assert not hasattr(arr, 
op_name) pytest.skip(f"{op_name} not an array method") - result1 = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs={}) + result1 = arr._reduce(op_name, skipna=skipna, keepdims=True) result2 = getattr(df, op_name)(skipna=skipna).array tm.assert_extension_array_equal(result1, result2) @@ -136,6 +136,28 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): tm.assert_extension_array_equal(result1, expected) + def test_reduction_without_keepdims(self): + # GH52788 + # test _reduce without keepdims + + class DecimalArray2(DecimalArray): + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + # no keepdims in signature + return super()._reduce(name, skipna=skipna) + + arr = DecimalArray2([decimal.Decimal(2) for _ in range(100)]) + + ser = pd.Series(arr) + result = ser.agg("sum") + expected = decimal.Decimal(200) + assert result == expected + + df = pd.DataFrame({"a": arr, "b": arr}) + with tm.assert_produces_warning(FutureWarning): + result = df.agg("sum") + expected = pd.Series({"a": 200, "b": 200}, dtype=object) + tm.assert_series_equal(result, expected) + class TestNumericReduce(Reduce, base.BaseNumericReduceTests): pass diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index c49f83927d2fa..ebbc14d27026c 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -95,7 +95,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): exp_value = getattr(ser.dropna().astype(cmp_dtype), op_name)() expected = pd.array([exp_value], dtype=cmp_dtype) - result1 = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs={}) + result1 = arr._reduce(op_name, skipna=skipna, keepdims=True) result2 = getattr(df, op_name)(skipna=skipna).array tm.assert_extension_array_equal(result1, result2) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6c893bbbdfcf9..dd2838d4b5212 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -533,7 +533,7 @@ def check_reduce_frame(self, ser, op_name, skipna): "u": "uint64[pyarrow]", "f": "float64[pyarrow]", }[arr.dtype.kind] - result = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs=kwargs) + result = arr._reduce(op_name, skipna=skipna, keepdims=True, **kwargs) if not skipna and ser.isna().any(): expected = pd.array([pd.NA], dtype=cmp_dtype) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index c4062e8b7902a..63ae2b629e549 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -387,7 +387,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): else: raise TypeError("not supposed to reach this") - result = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs={}) + result = arr._reduce(op_name, skipna=skipna, keepdims=True) if not skipna and ser.isna().any(): expected = pd.array([pd.NA], dtype=cmp_dtype) else: From 5634106bd1f32b7da189199f522ee962ade37aa8 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 29 Jun 2023 08:21:21 +0100 Subject: [PATCH 61/65] REF: add keepdims parameter to ExtensionArray._reduce + remove ExtensionArray._reduce_and_wrap --- .../tests/arrays/categorical/test_analytics.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index fdc981b0c0d09..c42364d4d4377 100644 --- 
a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -9,6 +9,7 @@ from pandas import ( Categorical, CategoricalDtype, + DataFrame, Index, NaT, Series, @@ -56,17 +57,18 @@ def test_min_max_ordered(self, index_or_series_or_array): assert np.minimum.reduce(obj) == "d" assert np.maximum.reduce(obj) == "a" - def test_min_max_reduce_and_wrap(self): + def test_min_max_reduce(self): # GH52788 cat = Categorical(["a", "b", "c", "d"], ordered=True) + df = DataFrame(cat) - result_max = cat._reduce_and_wrap("max", kwargs={}) - expected_max = Categorical(["d"], dtype=cat.dtype) - tm.assert_categorical_equal(result_max, expected_max) + result_max = df.agg("max") + expected_max = Series(Categorical(["d"], dtype=cat.dtype)) + tm.assert_series_equal(result_max, expected_max) - result_min = cat._reduce_and_wrap("min", kwargs={}) - expected_min = Categorical(["a"], dtype=cat.dtype) - tm.assert_categorical_equal(result_min, expected_min) + result_min = df.agg("min") + expected_min = Series(Categorical(["a"], dtype=cat.dtype)) + tm.assert_series_equal(result_min, expected_min) @pytest.mark.parametrize( "categories,expected", From f85deab32d337775f6552d092f6bf3054fb60b88 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 29 Jun 2023 11:36:55 +0100 Subject: [PATCH 62/65] fix whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index de8d43234342e..bef646c765463 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -52,7 +52,7 @@ columns with a common dtype (:issue:`52788`). Notice that the dtype is now a masked dtype and pyarrow dtype, respectively, while previously it was a numpy integer dtype. -To allow Dataframe reductions to preserve extension dtypes, :ref:`ExtensionArray._reduce` has gotten a new keyword parameter ``keepdims``. Calling :ref:`ExtensionArray._reduce` with ``keepdims=True`` should return an array of length 1 along the reduction axis. In order to maintain backward compatibility, the parameter is not required, but will it become required in the future. If the parameter is not found in the signature, DataFrame reductions can not preserve extension dtypes. Also, if the parameter is not found, a ``FutureWarning`` will be emitted and type checkers like mypy may complain about the signature not being compatible with :ref:`ExtensionArray._reduce`. +To allow Dataframe reductions to preserve extension dtypes, :meth:`ExtensionArray._reduce` has gotten a new keyword parameter ``keepdims``. Calling :meth:`ExtensionArray._reduce` with ``keepdims=True`` should return an array of length 1 along the reduction axis. In order to maintain backward compatibility, the parameter is not required, but will it become required in the future. If the parameter is not found in the signature, DataFrame reductions can not preserve extension dtypes. Also, if the parameter is not found, a ``FutureWarning`` will be emitted and type checkers like mypy may complain about the signature not being compatible with :meth:`ExtensionArray._reduce`. .. 
_whatsnew_210.enhancements.cow: From 65197127cd943f6bc1c9ff5c27a3798855a05818 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 29 Jun 2023 16:49:08 +0100 Subject: [PATCH 63/65] fix _reduce call --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6644ae64a62bf..e5ff9c1247a2d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10888,7 +10888,7 @@ def blk_func(values, axis: Axis = 1): FutureWarning, stacklevel=find_stack_level(), ) - result = values._reduce(name, skipna=skipna, kwargs=kwds) + result = values._reduce(name, skipna=skipna, **kwds) return np.array([result]) else: return op(values, axis=axis, skipna=skipna, **kwds) From e3afa181949125f5155ba332059e4961b3c3a70d Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 12 Jul 2023 23:18:35 +0100 Subject: [PATCH 64/65] simplify test --- pandas/tests/arrays/integer/test_reduction.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py index f4e0706c30175..a5167387a7e18 100644 --- a/pandas/tests/arrays/integer/test_reduction.py +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -20,10 +20,7 @@ def test_preserve_dtypes(op): # op result = getattr(df.C, op)() - if op in {"sum", "prod", "min", "max"}: - assert isinstance(result, np.int64) - else: - assert isinstance(result, int) + assert isinstance(result, np.int64) # groupby result = getattr(df.groupby("A"), op)() From 899a2fb4483f23a9b0cda2a195a8eb7f4750183c Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 13 Jul 2023 11:50:28 +0100 Subject: [PATCH 65/65] add tests for any/all --- pandas/core/frame.py | 4 +- pandas/tests/arrays/integer/test_reduction.py | 127 ++++++++++++++---- 2 files changed, 105 insertions(+), 26 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1e48a78973ab1..8beaf0eff5301 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10946,9 +10946,9 @@ def _get_data() -> DataFrame: # simple case where we can use BlockManager.reduce res = df._mgr.reduce(blk_func) out = df._constructor_from_mgr(res, axes=res.axes).iloc[0] - if out_dtype is not None: + if out_dtype is not None and out.dtype != "boolean": out = out.astype(out_dtype) - elif (df._mgr.get_dtypes() == object).any(): + elif (df._mgr.get_dtypes() == object).any() and name not in ["any", "all"]: out = out.astype(object) elif len(self) == 0 and out.dtype == object and name in ("sum", "prod"): # Even if we are object dtype, follow numpy and return diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py index a5167387a7e18..5326a8cb0356b 100644 --- a/pandas/tests/arrays/integer/test_reduction.py +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -2,40 +2,119 @@ import pytest import pandas as pd +from pandas import ( + DataFrame, + Series, + array, +) import pandas._testing as tm -@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) -def test_preserve_dtypes(op): - # TODO(#22346): preserve Int64 dtype - # for ops that enable (mean would actually work here - # but generally it is a float return value) - df = pd.DataFrame( +@pytest.mark.parametrize( + "op, expected", + [ + ["sum", np.int64(3)], + ["prod", np.int64(2)], + ["min", np.int64(1)], + ["max", np.int64(2)], + ["mean", np.float64(1.5)], + ["median", np.float64(1.5)], + ["var", np.float64(0.5)], + ["std", np.float64(0.5**0.5)], + ["skew", 
pd.NA], + ["any", True], + ["all", True], + ], +) +def test_series_reductions(op, expected): + ser = Series([1, 2], dtype="Int64") + result = getattr(ser, op)() + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "op, expected", + [ + ["sum", Series([3], index=["a"], dtype="Int64")], + ["prod", Series([2], index=["a"], dtype="Int64")], + ["min", Series([1], index=["a"], dtype="Int64")], + ["max", Series([2], index=["a"], dtype="Int64")], + ["mean", Series([1.5], index=["a"], dtype="Float64")], + ["median", Series([1.5], index=["a"], dtype="Float64")], + ["var", Series([0.5], index=["a"], dtype="Float64")], + ["std", Series([0.5**0.5], index=["a"], dtype="Float64")], + ["skew", Series([pd.NA], index=["a"], dtype="Float64")], + ["any", Series([True], index=["a"], dtype="boolean")], + ["all", Series([True], index=["a"], dtype="boolean")], + ], +) +def test_dataframe_reductions(op, expected): + df = DataFrame({"a": array([1, 2], dtype="Int64")}) + result = getattr(df, op)() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "op, expected", + [ + ["sum", array([1, 3], dtype="Int64")], + ["prod", array([1, 3], dtype="Int64")], + ["min", array([1, 3], dtype="Int64")], + ["max", array([1, 3], dtype="Int64")], + ["mean", array([1, 3], dtype="Float64")], + ["median", array([1, 3], dtype="Float64")], + ["var", array([pd.NA], dtype="Float64")], + ["std", array([pd.NA], dtype="Float64")], + ["skew", array([pd.NA], dtype="Float64")], + ["any", array([True, True], dtype="boolean")], + ["all", array([True, True], dtype="boolean")], + ], +) +def test_groupby_reductions(op, expected): + df = DataFrame( { "A": ["a", "b", "b"], - "B": [1, None, 3], - "C": pd.array([1, None, 3], dtype="Int64"), + "B": array([1, None, 3], dtype="Int64"), } ) + result = getattr(df.groupby("A"), op)() + expected = DataFrame(expected, index=pd.Index(["a", "b"], name="A"), columns=["B"]) - # op - result = getattr(df.C, op)() - assert isinstance(result, np.int64) + tm.assert_frame_equal(result, expected) - # groupby - result = getattr(df.groupby("A"), op)() - expected = pd.DataFrame( - {"B": np.array([1.0, 3.0]), "C": pd.array([1, 3], dtype="Int64")}, - index=pd.Index(["a", "b"], name="A"), +@pytest.mark.parametrize( + "op, expected", + [ + ["sum", Series([4, 4], index=["B", "C"], dtype="Float64")], + ["prod", Series([3, 3], index=["B", "C"], dtype="Float64")], + ["min", Series([1, 1], index=["B", "C"], dtype="Float64")], + ["max", Series([3, 3], index=["B", "C"], dtype="Float64")], + ["mean", Series([2, 2], index=["B", "C"], dtype="Float64")], + ["median", Series([2, 2], index=["B", "C"], dtype="Float64")], + ["var", Series([2, 2], index=["B", "C"], dtype="Float64")], + ["std", Series([2**0.5, 2**0.5], index=["B", "C"], dtype="Float64")], + ["skew", Series([pd.NA, pd.NA], index=["B", "C"], dtype="Float64")], + ["any", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], + ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], + ], +) +def test_mixed_reductions(op, expected): + df = DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": array([1, None, 3], dtype="Int64"), + } ) - tm.assert_frame_equal(result, expected) + # series + result = getattr(df.C, op)() + tm.assert_equal(result, expected["C"]) -@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"]) -def test_dataframe_reductions(op): - # https://github.com/pandas-dev/pandas/pull/32867 - # ensure the integers are not cast to float during reductions - df = 
pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")}) - result = getattr(df, op)() - assert isinstance(result["a"], np.int64) + # frame + if op in ["any", "all"]: + result = getattr(df, op)() + else: + result = getattr(df, op)(numeric_only=True) + tm.assert_series_equal(result, expected)