From 330060c77ae6e8e3144ade59573aeb30c0aaac93 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 28 May 2021 15:22:16 +0100 Subject: [PATCH] REGR: fix DataFrame sum and prod with min_count and numeric_only --- doc/source/whatsnew/v1.2.5.rst | 1 + pandas/core/frame.py | 52 +++++++++++++++------------ pandas/tests/frame/test_reductions.py | 33 ++++++++++++----- 3 files changed, 54 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index 60e146b2212eb..1d7b7a762e2ae 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`) +- Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`) - Regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`) - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6e71cb49596c8..2f4cff3aa405c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9856,36 +9856,42 @@ def _get_data() -> DataFrame: return out - assert numeric_only is None - data = self values = data.values - try: - result = func(values) + if numeric_only is None: - except TypeError: - # e.g. in nanops trying to convert strs to float + try: + result = func(values) - data = _get_data() - labels = data._get_agg_axis(axis) + except TypeError: + # e.g. in nanops trying to convert strs to float - values = data.values - with np.errstate(all="ignore"): - result = func(values) + data = _get_data() + labels = data._get_agg_axis(axis) - # columns have been dropped GH#41480 - arg_name = "numeric_only" - if name in ["all", "any"]: - arg_name = "bool_only" - warnings.warn( - "Dropping of nuisance columns in DataFrame reductions " - f"(with '{arg_name}=None') is deprecated; in a future " - "version this will raise TypeError. Select only valid " - "columns before calling the reduction.", - FutureWarning, - stacklevel=5, - ) + values = data.values + with np.errstate(all="ignore"): + result = func(values) + + # columns have been dropped GH#41480 + arg_name = "numeric_only" + if name in ["all", "any"]: + arg_name = "bool_only" + warnings.warn( + "Dropping of nuisance columns in DataFrame reductions " + f"(with '{arg_name}=None') is deprecated; in a future " + "version this will raise TypeError. Select only valid " + "columns before calling the reduction.", + FutureWarning, + stacklevel=5, + ) + else: + if numeric_only: + data = _get_data() + labels = data._get_agg_axis(axis) + values = data.values + result = func(values) if hasattr(result, "dtype"): if filter_type == "bool" and notna(result).all(): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 564f5d20b0301..f84e980644753 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1,5 +1,6 @@ from datetime import timedelta from decimal import Decimal +import re from dateutil.tz import tzlocal import numpy as np @@ -154,7 +155,7 @@ def assert_stat_op_api(opname, float_frame, float_string_frame, has_numeric_only DataFrame with columns of type float float_string_frame : DataFrame DataFrame with both float and string columns - has_numeric_only : bool, default False + has_numeric_only : bool, default True Whether the method "opname" has the kwarg "numeric_only" """ # make sure works on mixed-type frame @@ -811,35 +812,36 @@ def test_sum_corner(self): assert len(axis1) == 0 @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) - def test_sum_prod_nanops(self, method, unit): + @pytest.mark.parametrize("numeric_only", [None, True, False]) + def test_sum_prod_nanops(self, method, unit, numeric_only): idx = ["a", "b", "c"] df = DataFrame({"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]}) # The default - result = getattr(df, method)() + result = getattr(df, method)(numeric_only=numeric_only) expected = Series([unit, unit, unit], index=idx, dtype="float64") tm.assert_series_equal(result, expected) # min_count=1 - result = getattr(df, method)(min_count=1) + result = getattr(df, method)(numeric_only=numeric_only, min_count=1) expected = Series([unit, unit, np.nan], index=idx) tm.assert_series_equal(result, expected) # min_count=0 - result = getattr(df, method)(min_count=0) + result = getattr(df, method)(numeric_only=numeric_only, min_count=0) expected = Series([unit, unit, unit], index=idx, dtype="float64") tm.assert_series_equal(result, expected) - result = getattr(df.iloc[1:], method)(min_count=1) + result = getattr(df.iloc[1:], method)(numeric_only=numeric_only, min_count=1) expected = Series([unit, np.nan, np.nan], index=idx) tm.assert_series_equal(result, expected) # min_count > 1 df = DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) - result = getattr(df, method)(min_count=5) + result = getattr(df, method)(numeric_only=numeric_only, min_count=5) expected = Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) - result = getattr(df, method)(min_count=6) + result = getattr(df, method)(numeric_only=numeric_only, min_count=6) expected = Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) @@ -1685,7 +1687,7 @@ def test_minmax_extensionarray(method, numeric_only): @pytest.mark.parametrize("meth", ["max", "min", "sum", "mean", "median"]) -def test_groupy_regular_arithmetic_equivalent(meth): +def test_groupby_regular_arithmetic_equivalent(meth): # GH#40660 df = DataFrame( {"a": [pd.Timedelta(hours=6), pd.Timedelta(hours=7)], "b": [12.1, 13.3]} @@ -1708,3 +1710,16 @@ def test_frame_mixed_numeric_object_with_timestamp(ts_value): result = df.sum() expected = Series([1, 1.1, "foo"], index=list("abc")) tm.assert_series_equal(result, expected) + + +def test_prod_sum_min_count_mixed_object(): + # https://github.com/pandas-dev/pandas/issues/41074 + df = DataFrame([1, "a", True]) + + result = df.prod(axis=0, min_count=1, numeric_only=False) + expected = Series(["a"]) + tm.assert_series_equal(result, expected) + + msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'") + with pytest.raises(TypeError, match=msg): + df.sum(axis=0, min_count=1, numeric_only=False)