From 898759d10cf808e7be69bf6b30eaba395b3df4fa Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 10 Nov 2022 08:53:11 -0500 Subject: [PATCH 1/6] BUG: DataFrame reductions with object dtype and axis=1 --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/frame.py | 83 +++++++++++---------------- pandas/tests/frame/test_reductions.py | 29 ++++++++++ 3 files changed, 63 insertions(+), 51 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 73a75667b46da..ca1d708786f6f 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -571,7 +571,7 @@ Timezones Numeric ^^^^^^^ - Bug in :meth:`DataFrame.add` cannot apply ufunc when inputs contain mixed DataFrame type and Series type (:issue:`39853`) -- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would not be coerced to float (:issue:`49551`) +- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would have results unnecessarily coerced to float; coercion still occurs for reductions that necessarily result in floats (``mean``, ``var``, ``std``, ``skew``) (:issue:`49603`) - Conversion diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1627a7add25ed..fe51d7b24cc50 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -134,7 +134,6 @@ is_iterator, is_list_like, is_numeric_dtype, - is_object_dtype, is_scalar, is_sequence, needs_i8_conversion, @@ -10522,57 +10521,41 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data - if numeric_only or axis == 0: - # For numeric_only non-None and axis non-None, we know - # which blocks to use and no try/except is needed. - # For numeric_only=None only the case with axis==0 and no object - # dtypes are unambiguous can be handled with BlockManager.reduce - # Case with EAs see GH#35881 - df = self - if numeric_only: - df = _get_data() - if axis == 1: - df = df.T - axis = 0 - - # After possibly _get_data and transposing, we are now in the - # simple case where we can use BlockManager.reduce - res, _ = df._mgr.reduce(blk_func, ignore_failures=False) - out = df._constructor(res).iloc[0] - if out_dtype is not None: - out = out.astype(out_dtype) - if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: - # Even if we are object dtype, follow numpy and return - # float64, see test_apply_funcs_over_empty - out = out.astype(np.float64) - - if numeric_only is None and out.shape[0] != df.shape[1]: - # columns have been dropped GH#41480 - com.deprecate_numeric_only_default( - type(self), name, deprecate_none=True - ) - - return out - - assert not numeric_only and axis == 1 - - data = self - values = data.values - result = func(values) + # Case with EAs see GH#35881 + df = self + if numeric_only: + df = _get_data() + if axis == 1: + if len(df.index) == 0: + # Taking a transpose would result in no columns, losing the dtype. + # In the empty case, reducing along axis 0 or 1 gives the same + # result dtype, so reduce with axis=0 and ignore values + result = df._reduce( + op, + name, + axis=0, + skipna=skipna, + numeric_only=False, + filter_type=filter_type, + **kwds, + ).iloc[:0] + result.index = df.index + return result + df = df.T + axis = 0 - if hasattr(result, "dtype"): - if filter_type == "bool" and notna(result).all(): - result = result.astype(np.bool_) - elif filter_type is None and is_object_dtype(result.dtype): - try: - result = result.astype(np.float64) - except (ValueError, TypeError): - # try to coerce to the original dtypes item by item if we can - pass + # After possibly _get_data and transposing, we are now in the + # simple case where we can use BlockManager.reduce + res, _ = df._mgr.reduce(blk_func, ignore_failures=False) + out = df._constructor(res).iloc[0] + if out_dtype is not None: + out = out.astype(out_dtype) + if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: + # Even if we are object dtype, follow numpy and return + # float64, see test_apply_funcs_over_empty + out = out.astype(np.float64) - labels = self._get_agg_axis(axis) - result = self._constructor_sliced(result, index=labels) - return result + return out def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: """ diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 0e5c6057b9a61..4262fde36c373 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -331,6 +331,8 @@ def test_stat_operators_attempt_obj_array(self, method, df): assert df.values.dtype == np.object_ result = getattr(df, method)(1) expected = getattr(df.astype("f8"), method)(1) + if method in ("sum", "prod", "min", "max"): + expected = expected.astype(object) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) @@ -718,6 +720,33 @@ def test_sum_corner(self): assert len(axis0) == 0 assert len(axis1) == 0 + @pytest.mark.parametrize( + "index", + [ + tm.makeRangeIndex(0), + tm.makeDateIndex(0), + tm.makeNumericIndex(0, dtype=int), + tm.makeNumericIndex(0, dtype=float), + tm.makeDateIndex(0, freq="M"), + tm.makePeriodIndex(0), + ], + ) + def test_axis_1_empty(self, reduction_functions, index): + df = DataFrame(columns=["a"], index=index) + result = getattr(df, reduction_functions)(axis=1) + expected_dtype = { + "any": "bool", + "all": "bool", + "count": "int", + "sum": "float", + "prod": "float", + "skew": "float", + "kurt": "float", + "sem": "float", + }.get(reduction_functions, "object") + expected = Series([], index=index, dtype=expected_dtype) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) @pytest.mark.parametrize("numeric_only", [None, True, False]) def test_sum_prod_nanops(self, method, unit, numeric_only): From cd696aefd2e85653d2c2afb8e70a9f6ea36569de Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 12 Nov 2022 08:00:13 -0500 Subject: [PATCH 2/6] Use intp --- pandas/tests/frame/test_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 4262fde36c373..df536e409f510 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -737,7 +737,7 @@ def test_axis_1_empty(self, reduction_functions, index): expected_dtype = { "any": "bool", "all": "bool", - "count": "int", + "count": "intp", "sum": "float", "prod": "float", "skew": "float", From 9de5b27387e39bbfc245dcaaf9e0c58550ca73cb Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 23 Nov 2022 07:40:56 -0500 Subject: [PATCH 3/6] Merge cleanup --- pandas/core/frame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f20fe62258597..272f7c3cdc156 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -132,6 +132,7 @@ is_integer_dtype, is_iterator, is_list_like, + is_object_dtype, is_scalar, is_sequence, needs_i8_conversion, From a37377557abcdaac07ebbc5d0927fc06d9eab41c Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 2 Dec 2022 11:17:54 -0500 Subject: [PATCH 4/6] Update fixture name --- pandas/tests/frame/test_reductions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 27349e0bc74e9..64a08b4e5bac4 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -713,9 +713,9 @@ def test_sum_corner(self): tm.makePeriodIndex(0), ], ) - def test_axis_1_empty(self, reduction_functions, index): + def test_axis_1_empty(self, all_reductions, index): df = DataFrame(columns=["a"], index=index) - result = getattr(df, reduction_functions)(axis=1) + result = getattr(df, all_reductions)(axis=1) expected_dtype = { "any": "bool", "all": "bool", @@ -725,7 +725,7 @@ def test_axis_1_empty(self, reduction_functions, index): "skew": "float", "kurt": "float", "sem": "float", - }.get(reduction_functions, "object") + }.get(all_reductions, "object") expected = Series([], index=index, dtype=expected_dtype) tm.assert_series_equal(result, expected) From 10568da5945c3caad5a63beef44496912d648744 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 6 Dec 2022 16:22:12 -0500 Subject: [PATCH 5/6] WIP --- pandas/core/frame.py | 2 +- pandas/core/nanops.py | 12 +++++++++--- pandas/tests/frame/test_reductions.py | 25 +++++++++++++++++++++---- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2631150882284..ba3b6e301fd92 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10547,7 +10547,7 @@ def _get_data() -> DataFrame: out = df._constructor(res).iloc[0] if out_dtype is not None: out = out.astype(out_dtype) - if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: + if len(self) == 0 and name in ["sum", "prod"]: # Even if we are object dtype, follow numpy and return # float64, see test_apply_funcs_over_empty out = out.astype(np.float64) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index b8e2b1fafe326..744cb1eae00d3 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -622,7 +622,7 @@ def nansum( 3.0 """ values, mask, dtype, dtype_max, _ = _get_values( - values, skipna, fill_value=0, mask=mask + values, skipna, fill_value=0.0, mask=mask ) dtype_sum = dtype_max if is_float_dtype(dtype): @@ -1389,7 +1389,7 @@ def nanprod( if skipna and mask is not None: values = values.copy() - values[mask] = 1 + values[mask] = 1.0 result = values.prod(axis) # error: Incompatible return value type (got "Union[ndarray, float]", expected # "float") @@ -1500,7 +1500,13 @@ def _maybe_null_out( result[null_mask] = None elif result is not NaT: if check_below_min_count(shape, mask, min_count): - result = np.nan + result_dtype = getattr(result, "dtype", None) + if is_float_dtype(result_dtype): + # Preserve dtype when possible + + result = np.array([np.nan], dtype=f"f{result_dtype.itemsize}")[0] + else: + result = np.nan return result diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 64a08b4e5bac4..ce0eccd0fdec2 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -323,12 +323,12 @@ def wrapper(x): DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object), ], ) - def test_stat_operators_attempt_obj_array(self, method, df): + def test_stat_operators_attempt_obj_array(self, method, df, using_array_manager): # GH#676 assert df.values.dtype == np.object_ result = getattr(df, method)(1) expected = getattr(df.astype("f8"), method)(1) - if method in ("sum", "prod", "min", "max"): + if not using_array_manager and method in ("sum", "prod", "min", "max"): expected = expected.astype(object) tm.assert_series_equal(result, expected) @@ -713,7 +713,7 @@ def test_sum_corner(self): tm.makePeriodIndex(0), ], ) - def test_axis_1_empty(self, all_reductions, index): + def test_axis_1_empty(self, all_reductions, index, using_array_manager): df = DataFrame(columns=["a"], index=index) result = getattr(df, all_reductions)(axis=1) expected_dtype = { @@ -726,6 +726,15 @@ def test_axis_1_empty(self, all_reductions, index): "kurt": "float", "sem": "float", }.get(all_reductions, "object") + if using_array_manager and all_reductions in ( + "max", + "min", + "mean", + "std", + "var", + "median", + ): + expected_dtype = "float" expected = Series([], index=index, dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -1368,7 +1377,9 @@ def test_min_max_dt64_with_NaT(self): exp = Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) - def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture): + def test_min_max_dt64_with_NaT_skipna_false( + self, request, tz_naive_fixture, using_array_manager + ): # GH#36907 tz = tz_naive_fixture if isinstance(tz, tzlocal) and is_platform_windows(): @@ -1390,12 +1401,18 @@ def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture): expected = Series([df.loc[0, "a"], pd.NaT]) assert expected.dtype == df["a"].dtype + if using_array_manager and tz is not None: + expected = expected.astype(object) + tm.assert_series_equal(res, expected) res = df.max(axis=1, skipna=False) expected = Series([df.loc[0, "b"], pd.NaT]) assert expected.dtype == df["a"].dtype + if using_array_manager and tz is not None: + expected = expected.astype(object) + tm.assert_series_equal(res, expected) def test_min_max_dt64_api_consistency_with_NaT(self): From d0307d9c1135284bb0e6d9689514102842cdd534 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 10 Dec 2022 13:42:48 -0500 Subject: [PATCH 6/6] Fixups --- pandas/core/frame.py | 5 ----- pandas/core/internals/array_manager.py | 3 ++- pandas/tests/frame/test_reductions.py | 31 ++++++++++++++------------ 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dafe09c487d26..5154a777789c6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10492,10 +10492,6 @@ def _reduce( axis = self._get_axis_number(axis) assert axis in [0, 1] - def func(values: np.ndarray): - # We only use this in the case that operates on self.values - return op(values, axis=axis, skipna=skipna, **kwds) - def blk_func(values, axis: Axis = 1): if isinstance(values, ExtensionArray): if not is_1d_only_ea_dtype(values.dtype) and not isinstance( @@ -10536,7 +10532,6 @@ def _get_data() -> DataFrame: result.index = df.index return result df = df.T - axis = 0 # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager.reduce diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 2fffd8ebe2f72..6ef4fe2b5267d 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -986,8 +986,9 @@ def reduce(self: T, func: Callable) -> T: else: # error: Argument 1 to "append" of "list" has incompatible type # "ExtensionArray"; expected "ndarray" + dtype = arr.dtype if res is NaT else None result_arrays.append( - sanitize_array([res], None) # type: ignore[arg-type] + sanitize_array([res], None, dtype=dtype) # type: ignore[arg-type] ) index = Index._simple_new(np.array([None], dtype=object)) # placeholder diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index e1efd9f90cf69..5b96b4cd32864 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -323,13 +323,26 @@ def wrapper(x): DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object), ], ) - def test_stat_operators_attempt_obj_array(self, method, df, using_array_manager): + def test_stat_operators_attempt_obj_array( + self, method, df, using_array_manager, axis + ): # GH#676 assert df.values.dtype == np.object_ - result = getattr(df, method)(1) - expected = getattr(df.astype("f8"), method)(1) + result = getattr(df, method)(axis=axis) + expected = getattr(df.astype("f8"), method)(axis=axis) + # With values an np.array with dtype object: + # - When using blocks, `values.sum(axis=1, ...)` returns a np.array of dim 1 + # and this remains object dtype + # - When using arrays, `values.sum(axis=0, ...)` returns a Python float if not using_array_manager and method in ("sum", "prod", "min", "max"): expected = expected.astype(object) + elif ( + using_array_manager + and axis in (0, "index") + and method in ("min", "max") + and 0 in df.columns + ): + expected = expected.astype(int) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) @@ -1377,9 +1390,7 @@ def test_min_max_dt64_with_NaT(self): exp = Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) - def test_min_max_dt64_with_NaT_skipna_false( - self, request, tz_naive_fixture, using_array_manager - ): + def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture): # GH#36907 tz = tz_naive_fixture if isinstance(tz, tzlocal) and is_platform_windows(): @@ -1400,19 +1411,11 @@ def test_min_max_dt64_with_NaT_skipna_false( res = df.min(axis=1, skipna=False) expected = Series([df.loc[0, "a"], pd.NaT]) assert expected.dtype == df["a"].dtype - - if using_array_manager and tz is not None: - expected = expected.astype(object) - tm.assert_series_equal(res, expected) res = df.max(axis=1, skipna=False) expected = Series([df.loc[0, "b"], pd.NaT]) assert expected.dtype == df["a"].dtype - - if using_array_manager and tz is not None: - expected = expected.astype(object) - tm.assert_series_equal(res, expected) def test_min_max_dt64_api_consistency_with_NaT(self):