BUG: DataFrame reductions dtypes on object input (#51335)

rhshadrach · web-flow · commit b836a88f81c5 · 2023-02-17T17:14:54.000-08:00
* BUG: DataFrame reductions dtypes

* whatsnew

* dtype fixup; whatsnew

* Add test, fix whatsnew

* Add datetime test

* result_dtype.type

* xfail

* type-ignore
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -814,7 +814,7 @@ Other API changes
 - The levels of the index of the :class:`Series` returned from ``Series.sparse.from_coo`` now always have dtype ``int32``. Previously they had dtype ``int64`` (:issue:`50926`)
 - :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`50301`)
 - The methods :meth:`Series.round`, :meth:`DataFrame.__invert__`, :meth:`Series.__invert__`, :meth:`DataFrame.swapaxes`, :meth:`DataFrame.first`, :meth:`DataFrame.last`, :meth:`Series.first`, :meth:`Series.last` and :meth:`DataFrame.align` will now always return new objects (:issue:`51032`)
-- :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`)
+- :class:`DataFrame` and :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results; explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`, :issue:`49603`)
 - Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`)
 -
 
@@ -1226,11 +1226,11 @@ Numeric
 ^^^^^^^
 - Bug in :meth:`DataFrame.add` cannot apply ufunc when inputs contain mixed DataFrame type and Series type (:issue:`39853`)
 - Bug in arithmetic operations on :class:`Series` not propagating mask when combining masked dtypes and numpy dtypes (:issue:`45810`, :issue:`42630`)
-- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would not be coerced to float (:issue:`49551`)
 - Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError`` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`49759`)
 - Bug in :meth:`Series.__add__` casting to object for list and masked :class:`Series` (:issue:`22962`)
 - Bug in :meth:`~arrays.ArrowExtensionArray.mode` where ``dropna=False`` was not respected when there was ``NA`` values (:issue:`50982`)
 - Bug in :meth:`DataFrame.query` with ``engine="numexpr"`` and column names are ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`)
+- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` returning incorrect results with ``axis=1`` on tz-aware data containing ``pd.NaT`` (:issue:`51242`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -293,6 +293,14 @@ def ordered(request):
     return request.param
 
 
+@pytest.fixture(params=[True, False])
+def skipna(request):
+    """
+    Boolean 'skipna' parameter.
+    """
+    return request.param
+
+
 @pytest.fixture(params=["first", "last", False])
 def keep(request):
     """
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -141,7 +141,6 @@
     is_integer_dtype,
     is_iterator,
     is_list_like,
-    is_object_dtype,
     is_scalar,
     is_sequence,
     needs_i8_conversion,
@@ -10458,54 +10457,44 @@ def _get_data() -> DataFrame:
                 data = self._get_bool_data()
             return data
 
-        if numeric_only or axis == 0:
-            # For numeric_only non-None and axis non-None, we know
-            #  which blocks to use and no try/except is needed.
-            #  For numeric_only=None only the case with axis==0 and no object
-            #  dtypes are unambiguous can be handled with BlockManager.reduce
-            # Case with EAs see GH#35881
-            df = self
-            if numeric_only:
-                df = _get_data()
-            if axis == 1:
-                df = df.T
-                axis = 0
-
-            # After possibly _get_data and transposing, we are now in the
-            #  simple case where we can use BlockManager.reduce
-            res = df._mgr.reduce(blk_func)
-            out = df._constructor(res).iloc[0]
-            if out_dtype is not None:
-                out = out.astype(out_dtype)
-            if axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
-                # Even if we are object dtype, follow numpy and return
-                #  float64, see test_apply_funcs_over_empty
-                out = out.astype(np.float64)
-
-            return out
-
-        assert not numeric_only and axis in (1, None)
-
-        data = self
-        values = data.values
-        result = func(values)
-
-        if hasattr(result, "dtype"):
-            if filter_type == "bool" and notna(result).all():
-                result = result.astype(np.bool_)
-            elif filter_type is None and is_object_dtype(result.dtype):
-                try:
-                    result = result.astype(np.float64)
-                except (ValueError, TypeError):
-                    # try to coerce to the original dtypes item by item if we can
-                    pass
-
+        # Case with EAs see GH#35881
+        df = self
+        if numeric_only:
+            df = _get_data()
         if axis is None:
-            return result
+            return func(df.values)
+        elif axis == 1:
+            if len(df.index) == 0:
+                # Taking a transpose would result in no columns, losing the dtype.
+                # In the empty case, reducing along axis 0 or 1 gives the same
+                # result dtype, so reduce with axis=0 and ignore values
+                result = df._reduce(
+                    op,
+                    name,
+                    axis=0,
+                    skipna=skipna,
+                    numeric_only=False,
+                    filter_type=filter_type,
+                    **kwds,
+                ).iloc[:0]
+                result.index = df.index
+                return result
+            df = df.T
+
+        # After possibly _get_data and transposing, we are now in the
+        #  simple case where we can use BlockManager.reduce
+        res = df._mgr.reduce(blk_func)
+        out = df._constructor(res).iloc[0]
+        if out_dtype is not None:
+            out = out.astype(out_dtype)
+        elif (df._mgr.get_dtypes() == object).any():
+            out = out.astype(object)
+        elif len(self) == 0 and name in ("sum", "prod"):
+            # Even if we are object dtype, follow numpy and return
+            #  float64, see test_apply_funcs_over_empty
+            out = out.astype(np.float64)
 
-        labels = self._get_agg_axis(axis)
-        result = self._constructor_sliced(result, index=labels)
-        return result
+        return out
 
     def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
         """
diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
@@ -984,14 +984,10 @@ def reduce(self: T, func: Callable) -> T:
             # TODO NaT doesn't preserve dtype, so we need to ensure to create
             # a timedelta result array if original was timedelta
             # what if datetime results in timedelta? (eg std)
-            if res is NaT and is_timedelta64_ns_dtype(arr.dtype):
-                result_arrays.append(np.array(["NaT"], dtype="timedelta64[ns]"))
-            else:
-                # error: Argument 1 to "append" of "list" has incompatible type
-                # "ExtensionArray"; expected "ndarray"
-                result_arrays.append(
-                    sanitize_array([res], None)  # type: ignore[arg-type]
-                )
+            dtype = arr.dtype if res is NaT else None
+            result_arrays.append(
+                sanitize_array([res], None, dtype=dtype)  # type: ignore[arg-type]
+            )
 
         index = Index._simple_new(np.array([None], dtype=object))  # placeholder
         columns = self.items
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -1535,7 +1535,12 @@ def _maybe_null_out(
                 result[null_mask] = None
     elif result is not NaT:
         if check_below_min_count(shape, mask, min_count):
-            result = np.nan
+            result_dtype = getattr(result, "dtype", None)
+            if is_float_dtype(result_dtype):
+                # error: Item "None" of "Optional[Any]" has no attribute "type"
+                result = result_dtype.type("nan")  # type: ignore[union-attr]
+            else:
+                result = np.nan
 
     return result
 
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
@@ -135,6 +135,8 @@ def test_apply_funcs_over_empty(func):
 
     result = df.apply(getattr(np, func))
     expected = getattr(df, func)()
+    if func in ("sum", "prod"):
+        expected = expected.astype(float)
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -317,11 +317,11 @@ def wrapper(x):
             DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object),
         ],
     )
-    def test_stat_operators_attempt_obj_array(self, method, df):
+    def test_stat_operators_attempt_obj_array(self, method, df, axis):
         # GH#676
         assert df.values.dtype == np.object_
-        result = getattr(df, method)(1)
-        expected = getattr(df.astype("f8"), method)(1)
+        result = getattr(df, method)(axis=axis)
+        expected = getattr(df.astype("f8"), method)(axis=axis).astype(object)
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
@@ -424,7 +424,7 @@ def test_mean_mixed_string_decimal(self):
         with pytest.raises(TypeError, match="unsupported operand type"):
             df.mean()
         result = df[["A", "C"]].mean()
-        expected = Series([2.7, 681.6], index=["A", "C"])
+        expected = Series([2.7, 681.6], index=["A", "C"], dtype=object)
         tm.assert_series_equal(result, expected)
 
     def test_var_std(self, datetime_frame):
@@ -687,6 +687,29 @@ def test_std_timedelta64_skipna_false(self):
         expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)])
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]]
+    )
+    def test_std_datetime64_with_nat(
+        self, values, skipna, using_array_manager, request
+    ):
+        # GH#51335
+        if using_array_manager and (
+            not skipna or all(value is pd.NaT for value in values)
+        ):
+            mark = pytest.mark.xfail(
+                reason="GH#51446: Incorrect type inference on NaT in reduction result"
+            )
+            request.node.add_marker(mark)
+        df = DataFrame({"a": to_datetime(values)})
+        result = df.std(skipna=skipna)
+        if not skipna or all(value is pd.NaT for value in values):
+            expected = Series({"a": pd.NaT}, dtype="timedelta64[ns]")
+        else:
+            # 86400000000000ns == 1 day
+            expected = Series({"a": 86400000000000}, dtype="timedelta64[ns]")
+        tm.assert_series_equal(result, expected)
+
     def test_sum_corner(self):
         empty_frame = DataFrame()
 
@@ -697,6 +720,29 @@ def test_sum_corner(self):
         assert len(axis0) == 0
         assert len(axis1) == 0
 
+    @pytest.mark.parametrize(
+        "index",
+        [
+            tm.makeRangeIndex(0),
+            tm.makeDateIndex(0),
+            tm.makeNumericIndex(0, dtype=int),
+            tm.makeNumericIndex(0, dtype=float),
+            tm.makeDateIndex(0, freq="M"),
+            tm.makePeriodIndex(0),
+        ],
+    )
+    def test_axis_1_empty(self, all_reductions, index):
+        df = DataFrame(columns=["a"], index=index)  # one column, empty index
+        result = getattr(df, all_reductions)(axis=1)
+        if all_reductions in ("any", "all"):
+            expected_dtype = "bool"
+        elif all_reductions == "count":
+            expected_dtype = "int64"
+        else:
+            expected_dtype = "object"
+        # The result must be empty and carry the frame's own index.
+        expected = Series([], index=index, dtype=expected_dtype)
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
     @pytest.mark.parametrize("numeric_only", [None, True, False])
     def test_sum_prod_nanops(self, method, unit, numeric_only):
@@ -1418,6 +1464,21 @@ def test_preserve_timezone(self, initial: str, method):
         result = getattr(df, method)(axis=1)
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize("method", ["min", "max"])
+    def test_minmax_tzaware_skipna_axis_1(self, method, skipna):
+        # GH#51242
+        val = to_datetime("1900-01-01", utc=True)
+        df = DataFrame(
+            {"a": Series([pd.NaT, pd.NaT, val]), "b": Series([pd.NaT, val, val])}
+        )
+        op = getattr(df, method)
+        result = op(axis=1, skipna=skipna)
+        if skipna:
+            expected = Series([pd.NaT, val, val])
+        else:
+            expected = Series([pd.NaT, pd.NaT, val])
+        tm.assert_series_equal(result, expected)
+
     def test_frame_any_with_timedelta(self):
         # GH#17667
         df = DataFrame(
@@ -1609,12 +1670,13 @@ def test_prod_sum_min_count_mixed_object():
 
 
 @pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"])
-def test_reduction_axis_none_returns_scalar(method):
+@pytest.mark.parametrize("numeric_only", [True, False])
+def test_reduction_axis_none_returns_scalar(method, numeric_only):
     # GH#21597 As of 2.0, axis=None reduces over all axes.
 
     df = DataFrame(np.random.randn(4, 4))
 
-    result = getattr(df, method)(axis=None)
+    result = getattr(df, method)(axis=None, numeric_only=numeric_only)
     np_arr = df.to_numpy()
     if method in {"skew", "kurt"}:
         comp_mod = pytest.importorskip("scipy.stats")
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
@@ -21,14 +21,6 @@
 use_bn = nanops._USE_BOTTLENECK
 
 
-@pytest.fixture(params=[True, False])
-def skipna(request):
-    """
-    Fixture to pass skipna to nanops functions.
-    """
-    return request.param
-
-
 @pytest.fixture
 def disable_bottleneck(monkeypatch):
     with monkeypatch.context() as m: