From d0bc83435b9dfdfaf4dfed0badb605ecee4b69a1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 10 Nov 2022 08:53:11 -0500 Subject: [PATCH 1/8] BUG: DataFrame reductions dtypes --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/conftest.py | 8 +++ pandas/core/frame.py | 83 +++++++++++--------------- pandas/core/internals/array_manager.py | 12 ++-- pandas/core/nanops.py | 10 +++- pandas/tests/apply/test_frame_apply.py | 2 + pandas/tests/frame/test_reductions.py | 46 ++++++++++++-- pandas/tests/test_nanops.py | 8 --- 8 files changed, 102 insertions(+), 69 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 29f360e050548..1c468877f4093 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1204,7 +1204,7 @@ Numeric ^^^^^^^ - Bug in :meth:`DataFrame.add` cannot apply ufunc when inputs contain mixed DataFrame type and Series type (:issue:`39853`) - Bug in arithmetic operations on :class:`Series` not propagating mask when combining masked dtypes and numpy dtypes (:issue:`45810`, :issue:`42630`) -- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would not be coerced to float (:issue:`49551`) +- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would have results coerced to float (:issue:`49603`) - Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError`` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`49759`) - Bug in :meth:`Series.__add__` casting to object for list and masked :class:`Series` (:issue:`22962`) - Bug in :meth:`DataFrame.query` with ``engine="numexpr"`` and column names are ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2c410bb98b506..25346abc936f3 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -293,6 +293,14 @@ def ordered(request): return request.param +@pytest.fixture(params=[True, False]) +def skipna(request): + """ + Boolean 'skipna' parameter. + """ + return request.param + + @pytest.fixture(params=["first", "last", False]) def keep(request): """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2650090a3f61a..98a3a99d2d47c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -141,7 +141,6 @@ is_integer_dtype, is_iterator, is_list_like, - is_object_dtype, is_scalar, is_sequence, needs_i8_conversion, @@ -10461,54 +10460,44 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data - if numeric_only or axis == 0: - # For numeric_only non-None and axis non-None, we know - # which blocks to use and no try/except is needed. - # For numeric_only=None only the case with axis==0 and no object - # dtypes are unambiguous can be handled with BlockManager.reduce - # Case with EAs see GH#35881 - df = self - if numeric_only: - df = _get_data() - if axis == 1: - df = df.T - axis = 0 - - # After possibly _get_data and transposing, we are now in the - # simple case where we can use BlockManager.reduce - res = df._mgr.reduce(blk_func) - out = df._constructor(res).iloc[0] - if out_dtype is not None: - out = out.astype(out_dtype) - if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: - # Even if we are object dtype, follow numpy and return - # float64, see test_apply_funcs_over_empty - out = out.astype(np.float64) - - return out - - assert not numeric_only and axis in (1, None) - - data = self - values = data.values - result = func(values) - - if hasattr(result, "dtype"): - if filter_type == "bool" and notna(result).all(): - result = result.astype(np.bool_) - elif filter_type is None and is_object_dtype(result.dtype): - try: - result = result.astype(np.float64) - except (ValueError, TypeError): - # try to coerce to the original dtypes item by item if we can - pass - + # Case with EAs see GH#35881 + df = self + if numeric_only: + df = _get_data() if axis is None: - return result + return func(df.values) + elif axis == 1: + if len(df.index) == 0: + # Taking a transpose would result in no columns, losing the dtype. + # In the empty case, reducing along axis 0 or 1 gives the same + # result dtype, so reduce with axis=0 and ignore values + result = df._reduce( + op, + name, + axis=0, + skipna=skipna, + numeric_only=False, + filter_type=filter_type, + **kwds, + ).iloc[:0] + result.index = df.index + return result + df = df.T + + # After possibly _get_data and transposing, we are now in the + # simple case where we can use BlockManager.reduce + res = df._mgr.reduce(blk_func) + out = df._constructor(res).iloc[0] + if out_dtype is not None: + out = out.astype(out_dtype) + elif (df._mgr.get_dtypes() == object).any(): + out = out.astype(object) + elif len(self) == 0 and name in ("sum", "prod"): + # Even if we are object dtype, follow numpy and return + # float64, see test_apply_funcs_over_empty + out = out.astype(np.float64) - labels = self._get_agg_axis(axis) - result = self._constructor_sliced(result, index=labels) - return result + return out def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: """ diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 431313e3a2960..1f9b7f1f17777 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -983,14 +983,10 @@ def reduce(self: T, func: Callable) -> T: # TODO NaT doesn't preserve dtype, so we need to ensure to create # a timedelta result array if original was timedelta # what if datetime results in timedelta? (eg std) - if res is NaT and is_timedelta64_ns_dtype(arr.dtype): - result_arrays.append(np.array(["NaT"], dtype="timedelta64[ns]")) - else: - # error: Argument 1 to "append" of "list" has incompatible type - # "ExtensionArray"; expected "ndarray" - result_arrays.append( - sanitize_array([res], None) # type: ignore[arg-type] - ) + dtype = arr.dtype if res is NaT else None + result_arrays.append( + sanitize_array([res], None, dtype=dtype) # type: ignore[arg-type] + ) index = Index._simple_new(np.array([None], dtype=object)) # placeholder columns = self.items diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 0af851669820e..f05b13a94c2bd 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1533,7 +1533,15 @@ def _maybe_null_out( result[null_mask] = None elif result is not NaT: if check_below_min_count(shape, mask, min_count): - result = np.nan + result_dtype = getattr(result, "dtype", None) + if is_float_dtype(result_dtype): + # Preserve dtype when possible + # mypy doesn't infer result_dtype is not None + result = getattr( + np, f"float{8 * result_dtype.itemsize}" # type: ignore[union-attr] + )("nan") + else: + result = np.nan return result diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 7aaad4d2ad081..1f2766edce135 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -135,6 +135,8 @@ def test_apply_funcs_over_empty(func): result = df.apply(getattr(np, func)) expected = getattr(df, func)() + if func in ("sum", "prod"): + expected = expected.astype(float) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 3e6074971352d..2be95b1e437c8 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -318,11 +318,11 @@ def wrapper(x): DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object), ], ) - def test_stat_operators_attempt_obj_array(self, method, df): + def test_stat_operators_attempt_obj_array(self, method, df, axis): # GH#676 assert df.values.dtype == np.object_ - result = getattr(df, method)(1) - expected = getattr(df.astype("f8"), method)(1) + result = getattr(df, method)(axis=axis) + expected = getattr(df.astype("f8"), method)(axis=axis).astype(object) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) @@ -425,7 +425,7 @@ def test_mean_mixed_string_decimal(self): with pytest.raises(TypeError, match="unsupported operand type"): df.mean() result = df[["A", "C"]].mean() - expected = Series([2.7, 681.6], index=["A", "C"]) + expected = Series([2.7, 681.6], index=["A", "C"], dtype=object) tm.assert_series_equal(result, expected) def test_var_std(self, datetime_frame): @@ -699,6 +699,29 @@ def test_sum_corner(self): assert len(axis0) == 0 assert len(axis1) == 0 + @pytest.mark.parametrize( + "index", + [ + tm.makeRangeIndex(0), + tm.makeDateIndex(0), + tm.makeNumericIndex(0, dtype=int), + tm.makeNumericIndex(0, dtype=float), + tm.makeDateIndex(0, freq="M"), + tm.makePeriodIndex(0), + ], + ) + def test_axis_1_empty(self, all_reductions, index, using_array_manager): + df = DataFrame(columns=["a"], index=index) + result = getattr(df, all_reductions)(axis=1) + if all_reductions in ("any", "all"): + expected_dtype = "bool" + elif all_reductions == "count": + expected_dtype = "int" + else: + expected_dtype = "object" + expected = Series([], index=index, dtype=expected_dtype) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) @pytest.mark.parametrize("numeric_only", [None, True, False]) def test_sum_prod_nanops(self, method, unit, numeric_only): @@ -1423,6 +1446,21 @@ def test_preserve_timezone(self, initial: str, method): result = getattr(df, method)(axis=1) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("method", ["min", "max"]) + def test_minmax_tzaware_skipna_axis_1(self, method, skipna): + # GH#51242 + val = to_datetime("1900-01-01", utc=True) + df = DataFrame( + {"a": Series([pd.NaT, pd.NaT, val]), "b": Series([pd.NaT, val, val])} + ) + op = getattr(df, method) + result = op(axis=1, skipna=skipna) + if skipna: + expected = Series([pd.NaT, val, val]) + else: + expected = Series([pd.NaT, pd.NaT, val]) + tm.assert_series_equal(result, expected) + def test_frame_any_with_timedelta(self): # GH#17667 df = DataFrame( diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index d6deafb9012ff..e9d438bee6a83 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -21,14 +21,6 @@ use_bn = nanops._USE_BOTTLENECK -@pytest.fixture(params=[True, False]) -def skipna(request): - """ - Fixture to pass skipna to nanops functions. - """ - return request.param - - @pytest.fixture def disable_bottleneck(monkeypatch): with monkeypatch.context() as m: From 873c309f8b36380d25158b399c6ca1774376821a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 11 Feb 2023 16:27:01 -0500 Subject: [PATCH 2/8] whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 1c468877f4093..9c401c8445e24 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1208,6 +1208,7 @@ Numeric - Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError`` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`49759`) - Bug in :meth:`Series.__add__` casting to object for list and masked :class:`Series` (:issue:`22962`) - Bug in :meth:`DataFrame.query` with ``engine="numexpr"`` and column names are ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`) +- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with NA tz-aware data and ``axis=1`` would return incorrect results (:issue:`51242`) Conversion ^^^^^^^^^^ From c10c6b36211006058da89059ad687dce84304d78 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 12 Feb 2023 13:17:21 -0500 Subject: [PATCH 3/8] dtype fixup; whatsnew --- doc/source/whatsnew/v2.0.0.rst | 5 ++--- pandas/tests/frame/test_reductions.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 9c401c8445e24..a88bfb3142f62 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -796,7 +796,7 @@ Other API changes - The levels of the index of the :class:`Series` returned from ``Series.sparse.from_coo`` now always have dtype ``int32``. Previously they had dtype ``int64`` (:issue:`50926`) - :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`50301`) - The methods :meth:`Series.round`, :meth:`DataFrame.__invert__`, :meth:`Series.__invert__`, :meth:`DataFrame.swapaxes`, :meth:`DataFrame.first`, :meth:`DataFrame.last`, :meth:`Series.first`, :meth:`Series.last` and :meth:`DataFrame.align` will now always return new objects (:issue:`51032`) -- :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`) +- :class:`DataFrame` and :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`, :issue:`49603`) - Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`) - @@ -1204,11 +1204,10 @@ Numeric ^^^^^^^ - Bug in :meth:`DataFrame.add` cannot apply ufunc when inputs contain mixed DataFrame type and Series type (:issue:`39853`) - Bug in arithmetic operations on :class:`Series` not propagating mask when combining masked dtypes and numpy dtypes (:issue:`45810`, :issue:`42630`) -- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would have results coerced to float (:issue:`49603`) - Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError`` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`49759`) - Bug in :meth:`Series.__add__` casting to object for list and masked :class:`Series` (:issue:`22962`) - Bug in :meth:`DataFrame.query` with ``engine="numexpr"`` and column names are ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`) -- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with NA tz-aware data and ``axis=1`` would return incorrect results (:issue:`51242`) +- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with tz-aware data containing NA and ``axis=1`` would return incorrect results (:issue:`51242`) Conversion ^^^^^^^^^^ diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 2be95b1e437c8..39faa1c4e8bd9 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -716,7 +716,7 @@ def test_axis_1_empty(self, all_reductions, index, using_array_manager): if all_reductions in ("any", "all"): expected_dtype = "bool" elif all_reductions == "count": - expected_dtype = "int" + expected_dtype = "int64" else: expected_dtype = "object" expected = Series([], index=index, dtype=expected_dtype) From 115b5c43038db40397c51b7443cae400396306ec Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 13 Feb 2023 15:55:22 -0500 Subject: [PATCH 4/8] Add test, fix whatsnew --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/tests/frame/test_reductions.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index a88bfb3142f62..bc65b4f86a6ba 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1207,7 +1207,7 @@ Numeric - Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError`` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`49759`) - Bug in :meth:`Series.__add__` casting to object for list and masked :class:`Series` (:issue:`22962`) - Bug in :meth:`DataFrame.query` with ``engine="numexpr"`` and column names are ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`) -- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with tz-aware data containing NA and ``axis=1`` would return incorrect results (:issue:`51242`) +- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with tz-aware data containing ``pd.NaT`` and ``axis=1`` would return incorrect results (:issue:`51242`) Conversion ^^^^^^^^^^ diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 39faa1c4e8bd9..3b28ada6b712e 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1652,12 +1652,13 @@ def test_prod_sum_min_count_mixed_object(): @pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"]) -def test_reduction_axis_none_returns_scalar(method): +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_reduction_axis_none_returns_scalar(method, numeric_only): # GH#21597 As of 2.0, axis=None reduces over all axes. df = DataFrame(np.random.randn(4, 4)) - result = getattr(df, method)(axis=None) + result = getattr(df, method)(axis=None, numeric_only=numeric_only) np_arr = df.to_numpy() if method in {"skew", "kurt"}: comp_mod = pytest.importorskip("scipy.stats") From a93adf0c9e30ef4140052d62d3258549575d48dc Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 16 Feb 2023 16:40:55 -0500 Subject: [PATCH 5/8] Add datetime test --- pandas/tests/frame/test_reductions.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 3e1ec2c377abd..49aef56b8b4c0 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -687,6 +687,20 @@ def test_std_timedelta64_skipna_false(self): expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]] + ) + def test_std_datetime64_with_nat(self, values, skipna): + # GH#51335 + df = DataFrame({"a": to_datetime(values)}) + result = df.std(skipna=skipna) + if not skipna or all(value is pd.NaT for value in values): + expected = Series({"a": pd.NaT}, dtype="timedelta64[ns]") + else: + # 86400000000000ns == 1 day + expected = Series({"a": 86400000000000}, dtype="timedelta64[ns]") + tm.assert_series_equal(result, expected) + def test_sum_corner(self): empty_frame = DataFrame() From 9471c13631d2dd4f0886991ba2264d1a42878e17 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 16 Feb 2023 17:44:08 -0500 Subject: [PATCH 6/8] result_dtype.type --- pandas/core/nanops.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a4f01bacb24b7..2f469e012a23c 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1537,11 +1537,7 @@ def _maybe_null_out( if check_below_min_count(shape, mask, min_count): result_dtype = getattr(result, "dtype", None) if is_float_dtype(result_dtype): - # Preserve dtype when possible - # mypy doesn't infer result_dtype is not None - result = getattr( - np, f"float{8 * result_dtype.itemsize}" # type: ignore[union-attr] - )("nan") + result = result_dtype.type("nan") else: result = np.nan From ecee6ccffa67c37ea6b600b6ea77abd3d81d82c1 Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 16 Feb 2023 22:01:02 -0500 Subject: [PATCH 7/8] xfail --- pandas/tests/frame/test_reductions.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 49aef56b8b4c0..28809e2ecb788 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -690,8 +690,17 @@ def test_std_timedelta64_skipna_false(self): @pytest.mark.parametrize( "values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]] ) - def test_std_datetime64_with_nat(self, values, skipna): + def test_std_datetime64_with_nat( + self, values, skipna, using_array_manager, request + ): # GH#51335 + if using_array_manager and ( + not skipna or all(value is pd.NaT for value in values) + ): + mark = pytest.mark.xfail( + reason="GH#51446: Incorrect type inference on NaT in reduction result" + ) + request.node.add_marker(mark) df = DataFrame({"a": to_datetime(values)}) result = df.std(skipna=skipna) if not skipna or all(value is pd.NaT for value in values): From c9eaf90aac94435bae8cd74ec00d12603e1e6423 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 17 Feb 2023 16:19:32 -0500 Subject: [PATCH 8/8] type-ignore --- pandas/core/nanops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 2f469e012a23c..60c0d04ef28b0 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1537,7 +1537,8 @@ def _maybe_null_out( if check_below_min_count(shape, mask, min_count): result_dtype = getattr(result, "dtype", None) if is_float_dtype(result_dtype): - result = result_dtype.type("nan") + # error: Item "None" of "Optional[Any]" has no attribute "type" + result = result_dtype.type("nan") # type: ignore[union-attr] else: result = np.nan