ENH: better dtype inference when doing DataFrame reductions

topper-123 · topper-123 · commit ef89a57f1eb1 · 2023-04-19T18:34:25.000+01:00
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1213,7 +1213,9 @@ def _accumulate(
 
         return type(self)(result)
 
-    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
         """
         Return a scalar result of performing the reduction operation.
 
@@ -1310,6 +1312,12 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
         if name == "median":
             # GH 52679: Use quantile instead of approximate_median; returns array
             result = result[0]
+
+        if keepdims:
+            # TODO: is there a way to do this without .as_py()
+            result = pa.array([result.as_py()], type=result.type)
+            return type(self)(result)
+
         if pc.is_null(result).as_py():
             return self.dtype.na_value
 
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -1403,7 +1403,9 @@ def _accumulate(
         """
         raise NotImplementedError(f"cannot perform {name} with type {self.dtype}")
 
-    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
         """
         Return a scalar result of performing the reduction operation.
 
@@ -1433,7 +1435,14 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
                 f"'{type(self).__name__}' with dtype {self.dtype} "
                 f"does not support reduction '{name}'"
             )
-        return meth(skipna=skipna, **kwargs)
+        result = meth(skipna=skipna, **kwargs)
+
+        if keepdims:
+            # if subclasses want to avoid wrapping in np.array, do:
+            # super()._reduce(..., keepdims=False) and wrap that.
+            return np.array([[result]])
+        else:
+            return result
 
     # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
     # Incompatible types in assignment (expression has type "None", base class
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -42,10 +42,13 @@
 from pandas.core.dtypes.common import (
     is_bool,
     is_dtype_equal,
+    is_float_dtype,
     is_integer_dtype,
     is_list_like,
     is_scalar,
+    is_signed_integer_dtype,
     is_string_dtype,
+    is_unsigned_integer_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import BaseMaskedDtype
@@ -1069,7 +1072,15 @@ def _quantile(
     # ------------------------------------------------------------------
     # Reductions
 
-    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
+        if keepdims:
+            res = self.reshape(-1, 1)._reduce(name=name, skipna=skipna, **kwargs)
+            if res is libmissing.NA:
+                res = self._wrap_na_result(name)
+            return res
+
         if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}:
             return getattr(self, name)(skipna=skipna, **kwargs)
 
@@ -1097,6 +1108,30 @@ def _wrap_reduction_result(self, name: str, result, skipna, **kwargs):
             return self._maybe_mask_result(result, mask)
         return result
 
+    def _wrap_min_count_reduction_result(
+        self, name: str, result, skipna, min_count, **kwargs
+    ):
+        if min_count == 0 and isinstance(result, np.ndarray):
+            return self._maybe_mask_result(result, np.zeros(result.shape, dtype=bool))
+        return self._wrap_reduction_result(name, result, skipna, **kwargs)
+
+    def _wrap_na_result(self, name):
+        """Return a length-1, all-NA masked array typed for reduction ``name``."""
+        float_ops = ("mean", "median", "var", "std", "skew", "kurt", "sem")
+        if is_float_dtype(self.dtype) or name in float_ops:
+            np_dtype = np.float64
+        elif is_signed_integer_dtype(self.dtype):
+            np_dtype = np.int64
+        elif is_unsigned_integer_dtype(self.dtype):
+            np_dtype = np.uint64
+        else:
+            raise TypeError(
+                f"cannot build an NA result for '{name}' with dtype {self.dtype}"
+            )
+        # Placeholder payload; the all-True mask marks the single entry as NA.
+        value = np.array([1], dtype=np_dtype)
+        return self._maybe_mask_result(value, mask=np.ones(1, dtype=bool))
+
     def sum(
         self,
         *,
@@ -1114,8 +1149,8 @@ def sum(
             min_count=min_count,
             axis=axis,
         )
-        return self._wrap_reduction_result(
-            "sum", result, skipna=skipna, axis=axis, **kwargs
+        return self._wrap_min_count_reduction_result(
+            "sum", result, skipna=skipna, min_count=min_count, axis=axis, **kwargs
         )
 
     def prod(
@@ -1134,8 +1169,8 @@ def prod(
             min_count=min_count,
             axis=axis,
         )
-        return self._wrap_reduction_result(
-            "prod", result, skipna=skipna, axis=axis, **kwargs
+        return self._wrap_min_count_reduction_result(
+            "prod", result, skipna=skipna, min_count=min_count, axis=axis, **kwargs
         )
 
     def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -10858,7 +10858,7 @@ def blk_func(values, axis: Axis = 1):
                     self._mgr, ArrayManager
                 ):
                     return values._reduce(name, axis=1, skipna=skipna, **kwds)
-                return values._reduce(name, skipna=skipna, **kwds)
+                return values._reduce(name, skipna=skipna, keepdims=True, **kwds)
             else:
                 return op(values, axis=axis, skipna=skipna, **kwds)
 
@@ -10903,7 +10903,7 @@ def _get_data() -> DataFrame:
             out = out.astype(out_dtype)
         elif (df._mgr.get_dtypes() == object).any():
             out = out.astype(object)
-        elif len(self) == 0 and name in ("sum", "prod"):
+        elif len(self) == 0 and out.dtype == object and name in ("sum", "prod"):
             # Even if we are object dtype, follow numpy and return
             #  float64, see test_apply_funcs_over_empty
             out = out.astype(np.float64)
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -340,7 +340,10 @@ def reduce(self, func) -> list[Block]:
 
         if self.values.ndim == 1:
             # TODO(EA2D): special case not needed with 2D EAs
-            res_values = np.array([[result]])
+            if isinstance(result, (np.ndarray, ExtensionArray)):
+                res_values = result
+            else:
+                res_values = np.array([[result]])
         else:
             res_values = result.reshape(-1, 1)
 
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -6,7 +6,10 @@
 import numpy as np
 import pytest
 
-from pandas.compat import is_platform_windows
+from pandas.compat import (
+    IS64,
+    is_platform_windows,
+)
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -29,6 +32,8 @@
     nanops,
 )
 
+is_windows_or_is32 = is_platform_windows() or not IS64
+
 
 def assert_stat_op_calc(
     opname,
@@ -917,7 +922,7 @@ def test_mean_extensionarray_numeric_only_true(self):
         arr = np.random.randint(1000, size=(10, 5))
         df = DataFrame(arr, dtype="Int64")
         result = df.mean(numeric_only=True)
-        expected = DataFrame(arr).mean()
+        expected = DataFrame(arr).mean().astype("Float64")
         tm.assert_series_equal(result, expected)
 
     def test_stats_mixed_type(self, float_string_frame):
@@ -1626,6 +1631,101 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
             getattr(np, method)(df, axis=0)
 
 
+class TestEmptyDataFrameReductions:
+    @pytest.mark.parametrize(
+        "opname, dtype, exp_value, exp_dtype",
+        [
+            ("sum", np.int8, 0, np.int64),
+            ("prod", np.int8, 1, np.int_),
+            ("sum", np.int64, 0, np.int64),
+            ("prod", np.int64, 1, np.int64),
+            ("sum", np.uint8, 0, np.int64),
+            ("prod", np.uint8, 1, np.uint),
+            ("sum", np.uint64, 0, np.int64),
+            ("prod", np.uint64, 1, np.uint64),
+            ("sum", np.float32, 0, np.float32),
+            ("prod", np.float32, 1, np.float32),
+            ("sum", np.float64, 0, np.float64),
+        ],
+    )
+    def test_df_empty_min_count_0(self, opname, dtype, exp_value, exp_dtype):
+        """Empty numpy-dtype frame: sum/prod with min_count=0 give 0/1."""
+        df = DataFrame({0: [], 1: []}, dtype=dtype)
+        result = getattr(df, opname)(min_count=0)
+        expected = Series([exp_value, exp_value], dtype=exp_dtype)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "opname, dtype, exp_dtype",
+        [
+            ("sum", np.int8, np.float64),
+            ("prod", np.int8, np.float64),
+            ("sum", np.int64, np.float64),
+            ("prod", np.int64, np.float64),
+            ("sum", np.uint8, np.float64),
+            ("prod", np.uint8, np.float64),
+            ("sum", np.uint64, np.float64),
+            ("prod", np.uint64, np.float64),
+            ("sum", np.float32, np.float32),
+            ("prod", np.float32, np.float32),
+            ("sum", np.float64, np.float64),
+        ],
+    )
+    def test_df_empty_min_count_1(self, opname, dtype, exp_dtype):
+        """Empty numpy-dtype frame: sum/prod with min_count=1 give NaN."""
+        df = DataFrame({0: [], 1: []}, dtype=dtype)
+        result = getattr(df, opname)(min_count=1)
+        expected = Series([np.nan, np.nan], dtype=exp_dtype)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "opname, dtype, exp_value, exp_dtype",
+        [
+            ("sum", "Int8", 0, ("Int32" if is_windows_or_is32 else "Int64")),
+            ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")),
+            ("sum", "Int64", 0, "Int64"),
+            ("prod", "Int64", 1, "Int64"),
+            ("sum", "UInt8", 0, ("UInt32" if is_windows_or_is32 else "UInt64")),
+            ("prod", "UInt8", 1, ("UInt32" if is_windows_or_is32 else "UInt64")),
+            ("sum", "UInt64", 0, "UInt64"),
+            ("prod", "UInt64", 1, "UInt64"),
+            ("sum", "Float32", 0, "Float32"),
+            ("prod", "Float32", 1, "Float32"),
+            ("sum", "Float64", 0, "Float64"),
+            ("prod", "Float64", 1, "Float64"),
+        ],
+    )
+    def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype):
+        """Empty nullable-dtype frame: sum/prod with min_count=0 give 0/1."""
+        df = DataFrame({0: [], 1: []}, dtype=dtype)
+        result = getattr(df, opname)(min_count=0)
+        expected = Series([exp_value, exp_value], dtype=exp_dtype)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "opname, dtype, exp_dtype",
+        [
+            ("sum", "Int8", "Int64"),
+            ("prod", "Int8", "Int64"),
+            ("sum", "Int64", "Int64"),
+            ("prod", "Int64", "Int64"),
+            ("sum", "UInt8", "UInt64"),
+            ("prod", "UInt8", "UInt64"),
+            ("sum", "UInt64", "UInt64"),
+            ("prod", "UInt64", "UInt64"),
+            ("sum", "Float32", "Float32"),
+            ("prod", "Float32", "Float32"),
+            ("sum", "Float64", "Float64"),
+        ],
+    )
+    def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype):
+        """Empty nullable-dtype frame: sum/prod with min_count=1 give pd.NA."""
+        df = DataFrame({0: [], 1: []}, dtype=dtype)
+        result = getattr(df, opname)(min_count=1)
+        expected = Series([pd.NA, pd.NA], dtype=exp_dtype)
+        tm.assert_series_equal(result, expected)
+
+
 def test_sum_timedelta64_skipna_false(using_array_manager, request):
     # GH#17235
     if using_array_manager: