REF: add keepdims parameter to ExtensionArray._reduce + remove ExtensionArray._reduce_and_wrap

topper-123 · topper-123 · commit 49334c7a84f8 · 2023-06-29T09:39:13.000+01:00
diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
@@ -40,7 +40,6 @@ objects.
       api.extensions.ExtensionArray._from_sequence_of_strings
       api.extensions.ExtensionArray._hash_pandas_object
       api.extensions.ExtensionArray._reduce
-      api.extensions.ExtensionArray._reduce_and_wrap
       api.extensions.ExtensionArray._values_for_argsort
       api.extensions.ExtensionArray._values_for_factorize
       api.extensions.ExtensionArray.argsort
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -52,6 +52,8 @@ columns with a common dtype (:issue:`52788`).
 
 Notice that the dtype is now a masked dtype and pyarrow dtype, respectively, while previously it was a numpy integer dtype.
 
+To allow DataFrame reductions to preserve extension dtypes, :meth:`.ExtensionArray._reduce` has gotten a new keyword parameter ``keepdims``. Calling :meth:`.ExtensionArray._reduce` with ``keepdims=True`` should return an array of length 1 along the reduction axis. In order to maintain backward compatibility, the parameter is not required, but it will become required in the future. If the parameter is not found in the signature, DataFrame reductions cannot preserve extension dtypes. Also, if the parameter is not found, a ``FutureWarning`` will be emitted and type checkers like mypy may complain about the signature not being compatible with :meth:`.ExtensionArray._reduce`.
+
 .. _whatsnew_210.enhancements.cow:
 
 Copy-on-Write improvements
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1512,7 +1512,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
 
         return result
 
-    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
         """
         Return a scalar result of performing the reduction operation.
 
@@ -1536,18 +1538,16 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         ------
         TypeError : subclass does not define reductions
         """
-        result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)
-
-        if pc.is_null(result).as_py():
-            return self.dtype.na_value
+        pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)
 
-        return result.as_py()
+        if keepdims:
+            result = pa.array([pa_result.as_py()], type=pa_result.type)
+            return type(self)(result)
 
-    def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs):
-        """Takes the result of ``_reduce`` and wraps it an a ndarray/extensionArray."""
-        result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)
-        result = pa.array([result.as_py()], type=result.type)
-        return type(self)(result)
+        if pc.is_null(pa_result).as_py():
+            return self.dtype.na_value
+        else:
+            return pa_result.as_py()
 
     def _explode(self):
         """
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -140,7 +140,6 @@ class ExtensionArray:
     _from_sequence_of_strings
     _hash_pandas_object
     _reduce
-    _reduce_and_wrap
     _values_for_argsort
     _values_for_factorize
 
@@ -190,7 +189,6 @@ class ExtensionArray:
 
     * _accumulate
     * _reduce
-    * _reduce_and_wrap
 
     One can implement methods to handle parsing from strings that will be used
     in methods such as ``pandas.io.parsers.read_csv``.
@@ -1437,7 +1435,9 @@ def _accumulate(
         """
         raise NotImplementedError(f"cannot perform {name} with type {self.dtype}")
 
-    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
         """
         Return a scalar result of performing the reduction operation.
 
@@ -1449,6 +1449,15 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
             std, var, sem, kurt, skew }.
         skipna : bool, default True
             If True, skip NaN values.
+        keepdims : bool, default False
+            If False, a scalar is returned.
+            If True, the result has a dimension of size one along the reduced axis.
+
+            .. versionadded:: 2.1
+
+               This parameter is not required in the _reduce signature to keep backward
+               compatibility, but will become required in the future. If the parameter
+               is not found in the method signature, a FutureWarning will be emitted.
         **kwargs
             Additional keyword arguments passed to the reduction function.
             Currently, `ddof` is the only supported kwarg.
@@ -1460,41 +1469,18 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         Raises
         ------
         TypeError : subclass does not define reductions
-
-        See Also
-        --------
-        ExtensionArray._reduce_and_wrap
-            Calls ``_reduce`` and wraps the result in a ndarray/ExtensionArray.
         """
         meth = getattr(self, name, None)
         if meth is None:
             raise TypeError(
                 f"'{type(self).__name__}' with dtype {self.dtype} "
                 f"does not support reduction '{name}'"
             )
-        return meth(skipna=skipna, **kwargs)
-
-    def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs):
-        """
-        Call ``_reduce`` and wrap the result in a ndarray/ExtensionArray.
+        result = meth(skipna=skipna, **kwargs)
+        if keepdims:
+            result = np.array([result])
 
-        This is used to control the returned dtype when doing reductions in DataFrames,
-        and ensures the correct dtype for e.g. ``DataFrame({"a": extr_arr2}).sum()``.
-
-        Returns
-        -------
-        ndarray or ExtensionArray
-
-        Examples
-        --------
-        >>> arr = pd.array([1, 2, pd.NA])
-        >>> arr._reduce_and_wrap("sum", kwargs={})
-        <IntegerArray>
-        [3]
-        Length: 1, dtype: Int64
-        """
-        result = self._reduce(name, skipna=skipna, **kwargs)
-        return np.array([result])
+        return result
 
     # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
     # Incompatible types in assignment (expression has type "None", base class
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2229,9 +2229,14 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
     # ------------------------------------------------------------------
     # Reductions
 
-    def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs):
-        result = self._reduce(name, skipna=skipna, **kwargs)
-        return type(self)([result], dtype=self.dtype)
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
+        result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
+        if keepdims:
+            return type(self)(result, dtype=self.dtype)
+        else:
+            return result
 
     def min(self, *, skipna: bool = True, **kwargs):
         """
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -1083,30 +1083,31 @@ def _quantile(
     # ------------------------------------------------------------------
     # Reductions
 
-    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
         if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}:
-            return getattr(self, name)(skipna=skipna, **kwargs)
-
-        data = self._data
-        mask = self._mask
+            result = getattr(self, name)(skipna=skipna, **kwargs)
+        else:
+            # median, skew, kurt, sem
+            data = self._data
+            mask = self._mask
+            op = getattr(nanops, f"nan{name}")
+            axis = kwargs.pop("axis", None)
+            result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs)
+
+        if keepdims:
+            if isna(result):
+                return self._wrap_na_result(name=name, axis=0, mask_size=(1,))
+            else:
+                result = result.reshape(1)
+                mask = np.zeros(1, dtype=bool)
+                return self._maybe_mask_result(result, mask)
 
-        # median, skew, kurt, sem
-        op = getattr(nanops, f"nan{name}")
-        axis = kwargs.pop("axis", None)
-        result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs)
-        if np.isnan(result):
+        if isna(result):
             return libmissing.NA
-
-        return result
-
-    def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs):
-        res = self._reduce(name=name, skipna=skipna, **kwargs)
-        if res is libmissing.NA:
-            return self._wrap_na_result(name=name, axis=0, mask_size=(1,))
         else:
-            res = res.reshape(1)
-            mask = np.zeros(1, dtype=bool)
-            return self._maybe_mask_result(res, mask)
+            return result
 
     def _wrap_reduction_result(self, name: str, result, *, skipna, axis):
         if isinstance(result, np.ndarray):
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -1384,7 +1384,9 @@ def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
     # Reductions
     # ------------------------------------------------------------------------
 
-    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
         method = getattr(self, name, None)
 
         if method is None:
@@ -1395,7 +1397,12 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         else:
             arr = self.dropna()
 
-        return getattr(arr, name)(**kwargs)
+        result = getattr(arr, name)(**kwargs)
+
+        if keepdims:
+            return type(self)([result], dtype=self.dtype)
+        else:
+            return result
 
     def all(self, axis=None, *args, **kwargs):
         """
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -13,6 +13,7 @@
 import collections
 from collections import abc
 import functools
+from inspect import signature
 from io import StringIO
 import itertools
 import operator
@@ -10877,7 +10878,18 @@ def blk_func(values, axis: Axis = 1):
                     self._mgr, ArrayManager
                 ):
                     return values._reduce(name, axis=1, skipna=skipna, **kwds)
-                return values._reduce_and_wrap(name, skipna=skipna, kwargs=kwds)
+                sign = signature(values._reduce)
+                if "keepdims" in sign.parameters:
+                    return values._reduce(name, skipna=skipna, keepdims=True, **kwds)
+                else:
+                    warnings.warn(
+                        f"{type(values)}._reduce will require a `keepdims` parameter "
+                        "in the future",
+                        FutureWarning,
+                        stacklevel=find_stack_level(),
+                    )
+                    result = values._reduce(name, skipna=skipna, kwargs=kwds)
+                    return np.array([result])
             else:
                 return op(values, axis=axis, skipna=skipna, **kwds)
 
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
@@ -235,28 +235,29 @@ def _formatter(self, boxed=False):
     def _concat_same_type(cls, to_concat):
         return cls(np.concatenate([x._data for x in to_concat]))
 
-    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
-        if skipna:
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
+        if skipna and self.isna().any():
             # If we don't have any NAs, we can ignore skipna
-            if self.isna().any():
-                other = self[~self.isna()]
-                return other._reduce(name, **kwargs)
-
-        if name == "sum" and len(self) == 0:
+            other = self[~self.isna()]
+            result = other._reduce(name, **kwargs)
+        elif name == "sum" and len(self) == 0:
             # GH#29630 avoid returning int 0 or np.bool_(False) on old numpy
-            return decimal.Decimal(0)
-
-        try:
-            op = getattr(self.data, name)
-        except AttributeError as err:
-            raise NotImplementedError(
-                f"decimal does not support the {name} operation"
-            ) from err
-        return op(axis=0)
-
-    def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs):
-        result = self._reduce(name, skipna=skipna, **kwargs)
-        return type(self)([result])
+            result = decimal.Decimal(0)
+        else:
+            try:
+                op = getattr(self.data, name)
+            except AttributeError as err:
+                raise NotImplementedError(
+                    f"decimal does not support the {name} operation"
+                ) from err
+            result = op(axis=0)
+
+        if keepdims:
+            return type(self)([result])
+        else:
+            return result
 
     def _cmp_method(self, other, op):
         # For use with OpsMixin
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
@@ -123,7 +123,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool):
             assert not hasattr(arr, op_name)
             pytest.skip(f"{op_name} not an array method")
 
-        result1 = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs={})
+        result1 = arr._reduce(op_name, skipna=skipna, keepdims=True)
         result2 = getattr(df, op_name)(skipna=skipna).array
 
         tm.assert_extension_array_equal(result1, result2)
@@ -136,6 +136,28 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool):
 
         tm.assert_extension_array_equal(result1, expected)
 
+    def test_reduction_without_keepdims(self):
+        # GH52788
+        # test _reduce without keepdims
+
+        class DecimalArray2(DecimalArray):
+            def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+                # no keepdims in signature
+                return super()._reduce(name, skipna=skipna)
+
+        arr = DecimalArray2([decimal.Decimal(2) for _ in range(100)])
+
+        ser = pd.Series(arr)
+        result = ser.agg("sum")
+        expected = decimal.Decimal(200)
+        assert result == expected
+
+        df = pd.DataFrame({"a": arr, "b": arr})
+        with tm.assert_produces_warning(FutureWarning):
+            result = df.agg("sum")
+        expected = pd.Series({"a": 200, "b": 200}, dtype=object)
+        tm.assert_series_equal(result, expected)
+
 
 class TestNumericReduce(Reduce, base.BaseNumericReduceTests):
     pass
diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py
@@ -95,7 +95,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool):
             exp_value = getattr(ser.dropna().astype(cmp_dtype), op_name)()
             expected = pd.array([exp_value], dtype=cmp_dtype)
 
-        result1 = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs={})
+        result1 = arr._reduce(op_name, skipna=skipna, keepdims=True)
         result2 = getattr(df, op_name)(skipna=skipna).array
 
         tm.assert_extension_array_equal(result1, result2)
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -533,7 +533,7 @@ def check_reduce_frame(self, ser, op_name, skipna):
                 "u": "uint64[pyarrow]",
                 "f": "float64[pyarrow]",
             }[arr.dtype.kind]
-        result = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs=kwargs)
+        result = arr._reduce(op_name, skipna=skipna, keepdims=True, **kwargs)
 
         if not skipna and ser.isna().any():
             expected = pd.array([pd.NA], dtype=cmp_dtype)
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
@@ -387,7 +387,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool):
         else:
             raise TypeError("not supposed to reach this")
 
-        result = arr._reduce_and_wrap(op_name, skipna=skipna, kwargs={})
+        result = arr._reduce(op_name, skipna=skipna, keepdims=True)
         if not skipna and ser.isna().any():
             expected = pd.array([pd.NA], dtype=cmp_dtype)
         else: